Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 09:26:27 +03:00)
💫 Tidy up and auto-format tests (#2967)
* Auto-format tests with black
* Add flake8 config
* Tidy up and remove unused imports
* Fix redefinitions of test functions
* Replace orths_and_spaces with words and spaces
* Fix compatibility with pytest 4.0
* xfail test for now

  Test was previously overwritten by following test due to naming conflict, so the failure wasn't reported.

* Unfail passing test
* Only use fixture via arguments

  Fixes pytest 4.0 compatibility.
This commit is contained in:
parent 2c37e0ccf6
commit b6e991440c
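The "only use fixture via arguments" bullet above is the pytest 4.0 compatibility change: pytest 4.0 turned direct calls to fixture functions into errors, so tests may only request fixtures as parameters. A minimal sketch of the pattern, with hypothetical names that are not taken from this commit:

import pytest


@pytest.fixture
def sample_tokenizer():
    # hypothetical stand-in for the language tokenizer fixtures in conftest.py
    return str.split


def test_tokenizes_words(sample_tokenizer):
    # pytest >= 4.0: request the fixture as an argument instead of calling
    # sample_tokenizer() directly, which now raises an error
    assert sample_tokenizer("hello world") == ["hello", "world"]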
.flake8 (new file, 4 lines added)

@@ -0,0 +1,4 @@
+[flake8]
+ignore = E203, E266, E501, W503
+max-line-length = 80
+select = B,C,E,F,W,T4,B9
@@ -11,7 +11,7 @@ ujson>=1.35
 dill>=0.2,<0.3
 regex==2018.01.10
 requests>=2.13.0,<3.0.0
-pytest>=3.6.0,<4.0.0
+pytest>=4.0.0,<5.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 pathlib==1.0.1; python_version < "3.4"
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import pytest
-from io import StringIO, BytesIO
 from spacy.util import get_lang_class
 
 
@@ -11,126 +10,135 @@ def pytest_addoption(parser):
 
 
 def pytest_runtest_setup(item):
-    for opt in ['slow']:
+    for opt in ["slow"]:
         if opt in item.keywords and not item.config.getoption("--%s" % opt):
             pytest.skip("need --%s option to run" % opt)
 
 
-@pytest.fixture(scope='module')
+@pytest.fixture(scope="module")
 def tokenizer():
-    return get_lang_class('xx').Defaults.create_tokenizer()
+    return get_lang_class("xx").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def en_tokenizer():
-    return get_lang_class('en').Defaults.create_tokenizer()
+    return get_lang_class("en").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def en_vocab():
-    return get_lang_class('en').Defaults.create_vocab()
+    return get_lang_class("en").Defaults.create_vocab()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def en_parser(en_vocab):
-    nlp = get_lang_class('en')(en_vocab)
-    return nlp.create_pipe('parser')
+    nlp = get_lang_class("en")(en_vocab)
+    return nlp.create_pipe("parser")
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def es_tokenizer():
-    return get_lang_class('es').Defaults.create_tokenizer()
+    return get_lang_class("es").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def de_tokenizer():
-    return get_lang_class('de').Defaults.create_tokenizer()
+    return get_lang_class("de").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def fr_tokenizer():
-    return get_lang_class('fr').Defaults.create_tokenizer()
+    return get_lang_class("fr").Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def hu_tokenizer():
-    return get_lang_class('hu').Defaults.create_tokenizer()
+    return get_lang_class("hu").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def fi_tokenizer():
-    return get_lang_class('fi').Defaults.create_tokenizer()
+    return get_lang_class("fi").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def ro_tokenizer():
-    return get_lang_class('ro').Defaults.create_tokenizer()
+    return get_lang_class("ro").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def id_tokenizer():
-    return get_lang_class('id').Defaults.create_tokenizer()
+    return get_lang_class("id").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def sv_tokenizer():
-    return get_lang_class('sv').Defaults.create_tokenizer()
+    return get_lang_class("sv").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def bn_tokenizer():
-    return get_lang_class('bn').Defaults.create_tokenizer()
+    return get_lang_class("bn").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def ga_tokenizer():
-    return get_lang_class('ga').Defaults.create_tokenizer()
+    return get_lang_class("ga").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def he_tokenizer():
-    return get_lang_class('he').Defaults.create_tokenizer()
+    return get_lang_class("he").Defaults.create_tokenizer()
 
 
-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def nb_tokenizer():
-    return get_lang_class('nb').Defaults.create_tokenizer()
+    return get_lang_class("nb").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def da_tokenizer():
-    return get_lang_class('da').Defaults.create_tokenizer()
+    return get_lang_class("da").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def ja_tokenizer():
     mecab = pytest.importorskip("MeCab")
-    return get_lang_class('ja').Defaults.create_tokenizer()
+    return get_lang_class("ja").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def th_tokenizer():
     pythainlp = pytest.importorskip("pythainlp")
-    return get_lang_class('th').Defaults.create_tokenizer()
+    return get_lang_class("th").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def tr_tokenizer():
-    return get_lang_class('tr').Defaults.create_tokenizer()
+    return get_lang_class("tr").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def tt_tokenizer():
-    return get_lang_class('tt').Defaults.create_tokenizer()
+    return get_lang_class("tt").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def el_tokenizer():
-    return get_lang_class('el').Defaults.create_tokenizer()
+    return get_lang_class("el").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def ar_tokenizer():
-    return get_lang_class('ar').Defaults.create_tokenizer()
+    return get_lang_class("ar").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def ur_tokenizer():
-    return get_lang_class('ur').Defaults.create_tokenizer()
+    return get_lang_class("ur").Defaults.create_tokenizer()
 
-@pytest.fixture(scope='session')
+
+@pytest.fixture(scope="session")
 def ru_tokenizer():
-    pymorphy = pytest.importorskip('pymorphy2')
-    return get_lang_class('ru').Defaults.create_tokenizer()
+    pymorphy = pytest.importorskip("pymorphy2")
+    return get_lang_class("ru").Defaults.create_tokenizer()
@@ -38,7 +38,7 @@ def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
 
 def test_doc_array_tag(en_tokenizer):
     text = "A nice sentence."
-    pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
+    pos = ["DET", "ADJ", "NOUN", "PUNCT"]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos)
     assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
@@ -51,7 +51,7 @@ def test_doc_array_tag(en_tokenizer):
 
 def test_doc_array_dep(en_tokenizer):
     text = "A nice sentence."
-    deps = ['det', 'amod', 'ROOT', 'punct']
+    deps = ["det", "amod", "ROOT", "punct"]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
     feats_array = doc.to_array((ORTH, DEP))
@@ -9,7 +9,7 @@ from spacy.lemmatizer import Lemmatizer
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+    return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"})
 
 
 @pytest.fixture
@@ -23,15 +23,15 @@ def test_empty_doc(vocab):
 
 
 def test_single_word(vocab):
-    doc = Doc(vocab, words=['a'])
-    assert doc.text == 'a '
-    doc = Doc(vocab, words=['a'], spaces=[False])
-    assert doc.text == 'a'
+    doc = Doc(vocab, words=["a"])
+    assert doc.text == "a "
+    doc = Doc(vocab, words=["a"], spaces=[False])
+    assert doc.text == "a"
 
 
 def test_lookup_lemmatization(vocab):
-    doc = Doc(vocab, words=['dogs', 'dogses'])
-    assert doc[0].text == 'dogs'
-    assert doc[0].lemma_ == 'dog'
-    assert doc[1].text == 'dogses'
-    assert doc[1].lemma_ == 'dogses'
+    doc = Doc(vocab, words=["dogs", "dogses"])
+    assert doc[0].text == "dogs"
+    assert doc[0].lemma_ == "dog"
+    assert doc[1].text == "dogses"
+    assert doc[1].lemma_ == "dogses"
@@ -10,7 +10,7 @@ from spacy.attrs import LEMMA
 from ..util import get_doc
 
 
-@pytest.mark.parametrize('text', [["one", "two", "three"]])
+@pytest.mark.parametrize("text", [["one", "two", "three"]])
 def test_doc_api_compare_by_string_position(en_vocab, text):
     doc = Doc(en_vocab, words=text)
     # Get the tokens in this order, so their ID ordering doesn't match the idx
@@ -28,80 +28,81 @@ def test_doc_api_compare_by_string_position(en_vocab, text):
 def test_doc_api_getitem(en_tokenizer):
     text = "Give it back! He pleaded."
     tokens = en_tokenizer(text)
-    assert tokens[0].text == 'Give'
-    assert tokens[-1].text == '.'
+    assert tokens[0].text == "Give"
+    assert tokens[-1].text == "."
     with pytest.raises(IndexError):
         tokens[len(tokens)]
 
     def to_str(span):
-        return '/'.join(token.text for token in span)
+        return "/".join(token.text for token in span)
 
     span = tokens[1:1]
     assert not to_str(span)
     span = tokens[1:4]
-    assert to_str(span) == 'it/back/!'
+    assert to_str(span) == "it/back/!"
     span = tokens[1:4:1]
-    assert to_str(span) == 'it/back/!'
+    assert to_str(span) == "it/back/!"
     with pytest.raises(ValueError):
         tokens[1:4:2]
     with pytest.raises(ValueError):
         tokens[1:4:-1]
 
     span = tokens[-3:6]
-    assert to_str(span) == 'He/pleaded'
+    assert to_str(span) == "He/pleaded"
     span = tokens[4:-1]
-    assert to_str(span) == 'He/pleaded'
+    assert to_str(span) == "He/pleaded"
     span = tokens[-5:-3]
-    assert to_str(span) == 'back/!'
+    assert to_str(span) == "back/!"
     span = tokens[5:4]
     assert span.start == span.end == 5 and not to_str(span)
     span = tokens[4:-3]
     assert span.start == span.end == 4 and not to_str(span)
 
     span = tokens[:]
-    assert to_str(span) == 'Give/it/back/!/He/pleaded/.'
+    assert to_str(span) == "Give/it/back/!/He/pleaded/."
     span = tokens[4:]
-    assert to_str(span) == 'He/pleaded/.'
+    assert to_str(span) == "He/pleaded/."
     span = tokens[:4]
-    assert to_str(span) == 'Give/it/back/!'
+    assert to_str(span) == "Give/it/back/!"
     span = tokens[:-3]
-    assert to_str(span) == 'Give/it/back/!'
+    assert to_str(span) == "Give/it/back/!"
     span = tokens[-3:]
-    assert to_str(span) == 'He/pleaded/.'
+    assert to_str(span) == "He/pleaded/."
 
     span = tokens[4:50]
-    assert to_str(span) == 'He/pleaded/.'
+    assert to_str(span) == "He/pleaded/."
     span = tokens[-50:4]
-    assert to_str(span) == 'Give/it/back/!'
+    assert to_str(span) == "Give/it/back/!"
     span = tokens[-50:-40]
     assert span.start == span.end == 0 and not to_str(span)
     span = tokens[40:50]
     assert span.start == span.end == 7 and not to_str(span)
 
     span = tokens[1:4]
-    assert span[0].orth_ == 'it'
+    assert span[0].orth_ == "it"
     subspan = span[:]
-    assert to_str(subspan) == 'it/back/!'
+    assert to_str(subspan) == "it/back/!"
     subspan = span[:2]
-    assert to_str(subspan) == 'it/back'
+    assert to_str(subspan) == "it/back"
     subspan = span[1:]
-    assert to_str(subspan) == 'back/!'
+    assert to_str(subspan) == "back/!"
     subspan = span[:-1]
-    assert to_str(subspan) == 'it/back'
+    assert to_str(subspan) == "it/back"
     subspan = span[-2:]
-    assert to_str(subspan) == 'back/!'
+    assert to_str(subspan) == "back/!"
     subspan = span[1:2]
-    assert to_str(subspan) == 'back'
+    assert to_str(subspan) == "back"
     subspan = span[-2:-1]
-    assert to_str(subspan) == 'back'
+    assert to_str(subspan) == "back"
     subspan = span[-50:50]
-    assert to_str(subspan) == 'it/back/!'
+    assert to_str(subspan) == "it/back/!"
     subspan = span[50:-50]
     assert subspan.start == subspan.end == 4 and not to_str(subspan)
 
 
-@pytest.mark.parametrize('text', ["Give it back! He pleaded.",
-                                  " Give it back! He pleaded. "])
+@pytest.mark.parametrize(
+    "text", ["Give it back! He pleaded.", " Give it back! He pleaded. "]
+)
 def test_doc_api_serialize(en_tokenizer, text):
     tokens = en_tokenizer(text)
     new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
@@ -110,13 +111,15 @@ def test_doc_api_serialize(en_tokenizer, text):
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 
     new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(tensor=False), tensor=False)
+        tokens.to_bytes(tensor=False), tensor=False
+    )
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
 
     new_tokens = Doc(tokens.vocab).from_bytes(
-        tokens.to_bytes(sentiment=False), sentiment=False)
+        tokens.to_bytes(sentiment=False), sentiment=False
+    )
     assert tokens.text == new_tokens.text
     assert [t.text for t in tokens] == [t.text for t in new_tokens]
     assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@@ -126,10 +129,10 @@ def test_doc_api_set_ents(en_tokenizer):
     text = "I use goggle chrone to surf the web"
     tokens = en_tokenizer(text)
     assert len(tokens.ents) == 0
-    tokens.ents = [(tokens.vocab.strings['PRODUCT'], 2, 4)]
+    tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
     assert len(list(tokens.ents)) == 1
     assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
-    assert tokens.ents[0].label_ == 'PRODUCT'
+    assert tokens.ents[0].label_ == "PRODUCT"
     assert tokens.ents[0].start == 2
     assert tokens.ents[0].end == 4
 
@@ -140,21 +143,31 @@ def test_doc_api_merge(en_tokenizer):
     # merge 'The Beach Boys'
     doc = en_tokenizer(text)
    assert len(doc) == 9
-    doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
-              ent_type='TYPE')
+    doc.merge(
+        doc[4].idx,
+        doc[6].idx + len(doc[6]),
+        tag="NAMED",
+        lemma="LEMMA",
+        ent_type="TYPE",
+    )
     assert len(doc) == 7
-    assert doc[4].text == 'the beach boys'
-    assert doc[4].text_with_ws == 'the beach boys '
-    assert doc[4].tag_ == 'NAMED'
+    assert doc[4].text == "the beach boys"
+    assert doc[4].text_with_ws == "the beach boys "
+    assert doc[4].tag_ == "NAMED"
 
     # merge 'all night'
     doc = en_tokenizer(text)
     assert len(doc) == 9
-    doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), tag='NAMED', lemma='LEMMA',
-              ent_type='TYPE')
+    doc.merge(
+        doc[7].idx,
+        doc[8].idx + len(doc[8]),
+        tag="NAMED",
+        lemma="LEMMA",
+        ent_type="TYPE",
+    )
     assert len(doc) == 8
-    assert doc[7].text == 'all night'
-    assert doc[7].text_with_ws == 'all night'
+    assert doc[7].text == "all night"
+    assert doc[7].text_with_ws == "all night"
 
 
 def test_doc_api_merge_children(en_tokenizer):
@@ -162,8 +175,13 @@ def test_doc_api_merge_children(en_tokenizer):
     text = "WKRO played songs by the beach boys all night"
     doc = en_tokenizer(text)
     assert len(doc) == 9
-    doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
-              ent_type='TYPE')
+    doc.merge(
+        doc[4].idx,
+        doc[6].idx + len(doc[6]),
+        tag="NAMED",
+        lemma="LEMMA",
+        ent_type="TYPE",
+    )
 
     for word in doc:
         if word.i < word.head.i:
@@ -175,8 +193,8 @@ def test_doc_api_merge_children(en_tokenizer):
 def test_doc_api_merge_hang(en_tokenizer):
     text = "through North and South Carolina"
     doc = en_tokenizer(text)
-    doc.merge(18, 32, tag='', lemma='', ent_type='ORG')
-    doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
+    doc.merge(18, 32, tag="", lemma="", ent_type="ORG")
+    doc.merge(8, 32, tag="", lemma="", ent_type="ORG")
 
 
 def test_doc_api_retokenizer(en_tokenizer):
@@ -184,19 +202,19 @@ def test_doc_api_retokenizer(en_tokenizer):
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[4:7])
     assert len(doc) == 7
-    assert doc[4].text == 'the beach boys'
+    assert doc[4].text == "the beach boys"
 
 
 def test_doc_api_retokenizer_attrs(en_tokenizer):
     doc = en_tokenizer("WKRO played songs by the beach boys all night")
     # test both string and integer attributes and values
-    attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
+    attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[4:7], attrs=attrs)
     assert len(doc) == 7
-    assert doc[4].text == 'the beach boys'
-    assert doc[4].lemma_ == 'boys'
-    assert doc[4].ent_type_ == 'ORG'
+    assert doc[4].text == "the beach boys"
+    assert doc[4].lemma_ == "boys"
+    assert doc[4].ent_type_ == "ORG"
 
 
 @pytest.mark.xfail
@@ -205,11 +223,11 @@ def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
     doc = en_tokenizer("WKRO played beach boys songs")
     assert not any(token.is_stop for token in doc)
     with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[2:4], attrs={'LEMMA': 'boys', 'IS_STOP': True})
-    assert doc[2].text == 'beach boys'
-    assert doc[2].lemma_ == 'boys'
+        retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
+    assert doc[2].text == "beach boys"
+    assert doc[2].lemma_ == "boys"
     assert doc[2].is_stop
-    new_doc = Doc(doc.vocab, words=['beach boys'])
+    new_doc = Doc(doc.vocab, words=["beach boys"])
     assert new_doc[0].is_stop
 
 
@@ -222,21 +240,25 @@ def test_doc_api_sents_empty_string(en_tokenizer):
 
 def test_doc_api_runtime_error(en_tokenizer):
     # Example that caused run-time error while parsing Reddit
+    # fmt: off
     text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
-    deps = ['nsubj', 'prep', 'amod', 'pobj', 'ROOT', 'amod', 'attr', '',
-            'nummod', 'prep', 'det', 'amod', 'pobj', 'acl', 'prep', 'prep',
-            'pobj', '', 'nummod', 'prep', 'det', 'amod', 'pobj', 'aux', 'neg',
-            'ROOT', 'amod', 'dobj']
+    deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
+            "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
+            "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
+            "ROOT", "amod", "dobj"]
+    # fmt: on
 
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
 
     nps = []
     for np in doc.noun_chunks:
-        while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
+        while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
             np = np[1:]
         if len(np) > 1:
-            nps.append((np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_))
+            nps.append(
+                (np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)
+            )
     for np in nps:
         start, end, tag, lemma, ent_type = np
         doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
@@ -244,57 +266,76 @@ def test_doc_api_runtime_error(en_tokenizer):
 
 def test_doc_api_right_edge(en_tokenizer):
     """Test for bug occurring from Unshift action, causing incorrect right edge"""
+    # fmt: off
     text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
     heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
              -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
+    # fmt: on
 
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
-    assert doc[6].text == 'for'
+    assert doc[6].text == "for"
     subtree = [w.text for w in doc[6].subtree]
-    assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
-                       'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
-    assert doc[6].right_edge.text == ','
+    assert subtree == [
+        "for",
+        "the",
+        "sake",
+        "of",
+        "such",
+        "as",
+        "live",
+        "under",
+        "the",
+        "government",
+        "of",
+        "the",
+        "Romans",
+        ",",
+    ]
+    assert doc[6].right_edge.text == ","
 
 
 def test_doc_api_has_vector():
     vocab = Vocab()
     vocab.reset_vectors(width=2)
-    vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
-    doc = Doc(vocab, words=['kitten'])
+    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
+    doc = Doc(vocab, words=["kitten"])
     assert doc.has_vector
 
 
 def test_doc_api_similarity_match():
-    doc = Doc(Vocab(), words=['a'])
+    doc = Doc(Vocab(), words=["a"])
     with pytest.warns(None):
         assert doc.similarity(doc[0]) == 1.0
-        assert doc.similarity(doc.vocab['a']) == 1.0
-    doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
+        assert doc.similarity(doc.vocab["a"]) == 1.0
+    doc2 = Doc(doc.vocab, words=["a", "b", "c"])
     with pytest.warns(None):
         assert doc.similarity(doc2[:1]) == 1.0
         assert doc.similarity(doc2) == 0.0
 
 
 def test_lowest_common_ancestor(en_tokenizer):
-    tokens = en_tokenizer('the lazy dog slept')
+    tokens = en_tokenizer("the lazy dog slept")
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
     lca = doc.get_lca_matrix()
-    assert(lca[1, 1] == 1)
-    assert(lca[0, 1] == 2)
-    assert(lca[1, 2] == 2)
+    assert lca[1, 1] == 1
+    assert lca[0, 1] == 2
+    assert lca[1, 2] == 2
 
 
 def test_parse_tree(en_tokenizer):
     """Tests doc.print_tree() method."""
-    text = 'I like New York in Autumn.'
+    text = "I like New York in Autumn."
     heads = [1, 0, 1, -2, -3, -1, -5]
-    tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
+    tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags)
     # full method parse_tree(text) is a trivial composition
     trees = doc.print_tree()
     assert len(trees) > 0
     tree = trees[0]
-    assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
-    assert tree['word'] == 'like'  # check root is correct
+    assert all(
+        k in list(tree.keys())
+        for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
+    )
+    assert tree["word"] == "like"  # check root is correct
@@ -7,37 +7,38 @@ from spacy.compat import pickle, unicode_
 
 def test_pickle_single_doc():
     nlp = Language()
-    doc = nlp('pickle roundtrip')
+    doc = nlp("pickle roundtrip")
     data = pickle.dumps(doc, 1)
     doc2 = pickle.loads(data)
-    assert doc2.text == 'pickle roundtrip'
+    assert doc2.text == "pickle roundtrip"
 
 
 def test_list_of_docs_pickles_efficiently():
     nlp = Language()
     for i in range(10000):
         _ = nlp.vocab[unicode_(i)]
-    one_pickled = pickle.dumps(nlp('0'), -1)
+    one_pickled = pickle.dumps(nlp("0"), -1)
     docs = list(nlp.pipe(unicode_(i) for i in range(100)))
     many_pickled = pickle.dumps(docs, -1)
     assert len(many_pickled) < (len(one_pickled) * 2)
     many_unpickled = pickle.loads(many_pickled)
-    assert many_unpickled[0].text == '0'
-    assert many_unpickled[-1].text == '99'
+    assert many_unpickled[0].text == "0"
+    assert many_unpickled[-1].text == "99"
     assert len(many_unpickled) == 100
 
 
 def test_user_data_from_disk():
     nlp = Language()
-    doc = nlp('Hello')
+    doc = nlp("Hello")
     doc.user_data[(0, 1)] = False
     b = doc.to_bytes()
     doc2 = doc.__class__(doc.vocab).from_bytes(b)
     assert doc2.user_data[(0, 1)] == False
 
 
 def test_user_data_unpickles():
     nlp = Language()
-    doc = nlp('Hello')
+    doc = nlp("Hello")
     doc.user_data[(0, 1)] = False
     b = pickle.dumps(doc)
     doc2 = pickle.loads(b)
@@ -46,10 +47,11 @@ def test_user_data_unpickles():
 
 def test_hooks_unpickle():
     def inner_func(d1, d2):
-        return 'hello!'
+        return "hello!"
+
     nlp = Language()
-    doc = nlp('Hello')
-    doc.user_hooks['similarity'] = inner_func
+    doc = nlp("Hello")
+    doc.user_hooks["similarity"] = inner_func
     b = pickle.dumps(doc)
     doc2 = pickle.loads(b)
-    assert doc2.similarity(None) == 'hello!'
+    assert doc2.similarity(None) == "hello!"
@@ -11,10 +11,12 @@ from ..util import get_doc
 
 @pytest.fixture
 def doc(en_tokenizer):
+    # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
     heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
-    deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
-            'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
+    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
+            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
+    # fmt: on
     tokens = en_tokenizer(text)
     return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
 
@@ -39,17 +41,17 @@ def test_spans_sent_spans(doc):
 
 def test_spans_root(doc):
     span = doc[2:4]
     assert len(span) == 2
-    assert span.text == 'a sentence'
-    assert span.root.text == 'sentence'
-    assert span.root.head.text == 'is'
+    assert span.text == "a sentence"
+    assert span.root.text == "sentence"
+    assert span.root.head.text == "is"
 
 
 def test_spans_string_fn(doc):
     span = doc[0:4]
     assert len(span) == 4
-    assert span.text == 'This is a sentence'
-    assert span.upper_ == 'THIS IS A SENTENCE'
-    assert span.lower_ == 'this is a sentence'
+    assert span.text == "This is a sentence"
+    assert span.upper_ == "THIS IS A SENTENCE"
+    assert span.lower_ == "this is a sentence"
 
 def test_spans_root2(en_tokenizer):
@@ -57,15 +59,15 @@ def test_spans_root2(en_tokenizer):
     heads = [0, 3, -1, -2, -4]
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
-    assert doc[-2:].root.text == 'Carolina'
+    assert doc[-2:].root.text == "Carolina"
 
 
 def test_spans_span_sent(doc, doc_not_parsed):
     """Test span.sent property"""
     assert len(list(doc.sents))
-    assert doc[:2].sent.root.text == 'is'
-    assert doc[:2].sent.text == 'This is a sentence .'
-    assert doc[6:7].sent.root.left_edge.text == 'This'
+    assert doc[:2].sent.root.text == "is"
+    assert doc[:2].sent.text == "This is a sentence ."
+    assert doc[6:7].sent.root.left_edge.text == "This"
     # test on manual sbd
     doc_not_parsed[0].is_sent_start = True
     doc_not_parsed[5].is_sent_start = True
@@ -75,23 +77,23 @@ def test_spans_span_sent(doc, doc_not_parsed):
 
 def test_spans_lca_matrix(en_tokenizer):
     """Test span's lca matrix generation"""
-    tokens = en_tokenizer('the lazy dog slept')
+    tokens = en_tokenizer("the lazy dog slept")
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
     lca = doc[:2].get_lca_matrix()
-    assert(lca[0, 0] == 0)
-    assert(lca[0, 1] == -1)
-    assert(lca[1, 0] == -1)
-    assert(lca[1, 1] == 1)
+    assert lca[0, 0] == 0
+    assert lca[0, 1] == -1
+    assert lca[1, 0] == -1
+    assert lca[1, 1] == 1
 
 
 def test_span_similarity_match():
-    doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
+    doc = Doc(Vocab(), words=["a", "b", "a", "b"])
     span1 = doc[:2]
     span2 = doc[2:]
     with pytest.warns(None):
         assert span1.similarity(span2) == 1.0
         assert span1.similarity(doc) == 0.0
-        assert span1[:1].similarity(doc.vocab['a']) == 1.0
+        assert span1[:1].similarity(doc.vocab["a"]) == 1.0
 
 
 def test_spans_default_sentiment(en_tokenizer):
@@ -102,8 +104,8 @@ def test_spans_default_sentiment(en_tokenizer):
     tokens.vocab[tokens[2].text].sentiment = -2.0
     doc = Doc(tokens.vocab, words=[t.text for t in tokens])
     assert doc[:2].sentiment == 3.0 / 2
-    assert doc[-2:].sentiment == -2. / 2
-    assert doc[:-1].sentiment == (3.+-2) / 3.
+    assert doc[-2:].sentiment == -2.0 / 2
+    assert doc[:-1].sentiment == (3.0 + -2) / 3.0
 
 
 def test_spans_override_sentiment(en_tokenizer):
@@ -113,7 +115,7 @@ def test_spans_override_sentiment(en_tokenizer):
     tokens.vocab[tokens[0].text].sentiment = 3.0
     tokens.vocab[tokens[2].text].sentiment = -2.0
     doc = Doc(tokens.vocab, words=[t.text for t in tokens])
-    doc.user_span_hooks['sentiment'] = lambda span: 10.0
+    doc.user_span_hooks["sentiment"] = lambda span: 10.0
     assert doc[:2].sentiment == 10.0
     assert doc[-2:].sentiment == 10.0
     assert doc[:-1].sentiment == 10.0
@@ -132,10 +134,10 @@ def test_spans_are_hashable(en_tokenizer):
 
 def test_spans_by_character(doc):
     span1 = doc[1:-2]
-    span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
+    span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
     assert span1.start_char == span2.start_char
     assert span1.end_char == span2.end_char
-    assert span2.label_ == 'GPE'
+    assert span2.label_ == "GPE"
 
 
 def test_span_to_array(doc):
@@ -151,12 +153,13 @@ def test_span_as_doc(doc):
     span_doc = span.as_doc()
     assert span.text == span_doc.text.strip()
 
 
 def test_span_ents_property(doc):
     """Test span.ents for the """
     doc.ents = [
-        (doc.vocab.strings['PRODUCT'], 0, 1),
-        (doc.vocab.strings['PRODUCT'], 7, 8),
-        (doc.vocab.strings['PRODUCT'], 11, 14)
+        (doc.vocab.strings["PRODUCT"], 0, 1),
+        (doc.vocab.strings["PRODUCT"], 7, 8),
+        (doc.vocab.strings["PRODUCT"], 11, 14),
     ]
     assert len(list(doc.ents)) == 3
     sentences = list(doc.sents)
|
@ -13,22 +13,23 @@ def test_spans_merge_tokens(en_tokenizer):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].head.text == 'Angeles'
|
assert doc[0].head.text == "Angeles"
|
||||||
assert doc[1].head.text == 'start'
|
assert doc[1].head.text == "start"
|
||||||
doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', ent_type='GPE')
|
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE")
|
||||||
assert len(doc) == 3
|
assert len(doc) == 3
|
||||||
assert doc[0].text == 'Los Angeles'
|
assert doc[0].text == "Los Angeles"
|
||||||
assert doc[0].head.text == 'start'
|
assert doc[0].head.text == "start"
|
||||||
|
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
assert len(doc) == 4
|
assert len(doc) == 4
|
||||||
assert doc[0].head.text == 'Angeles'
|
assert doc[0].head.text == "Angeles"
|
||||||
assert doc[1].head.text == 'start'
|
assert doc[1].head.text == "start"
|
||||||
doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE')
|
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
|
||||||
assert len(doc) == 3
|
assert len(doc) == 3
|
||||||
assert doc[0].text == 'Los Angeles'
|
assert doc[0].text == "Los Angeles"
|
||||||
assert doc[0].head.text == 'start'
|
assert doc[0].head.text == "start"
|
||||||
assert doc[0].ent_type_ == 'GPE'
|
assert doc[0].ent_type_ == "GPE"
|
||||||
|
|
||||||
|
|
||||||
def test_spans_merge_heads(en_tokenizer):
|
def test_spans_merge_heads(en_tokenizer):
|
||||||
text = "I found a pilates class near work."
|
text = "I found a pilates class near work."
|
||||||
|
@@ -37,8 +38,13 @@ def test_spans_merge_heads(en_tokenizer):
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
 
     assert len(doc) == 8
-    doc.merge(doc[3].idx, doc[4].idx + len(doc[4]), tag=doc[4].tag_,
-              lemma='pilates class', ent_type='O')
+    doc.merge(
+        doc[3].idx,
+        doc[4].idx + len(doc[4]),
+        tag=doc[4].tag_,
+        lemma="pilates class",
+        ent_type="O",
+    )
     assert len(doc) == 7
     assert doc[0].head.i == 1
     assert doc[1].head.i == 1
@@ -55,8 +61,9 @@ def test_span_np_merges(en_tokenizer):
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
 
     assert doc[4].head.i == 1
-    doc.merge(doc[2].idx, doc[4].idx + len(doc[4]), tag='NP', lemma='tool',
-              ent_type='O')
+    doc.merge(
+        doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O"
+    )
     assert doc[2].head.i == 1
 
     text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
@@ -69,7 +76,6 @@ def test_span_np_merges(en_tokenizer):
         merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
         assert merged != None, (start, end, label, lemma)
 
-
     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
     tokens = en_tokenizer(text)
@@ -80,15 +86,23 @@
 
 
 def test_spans_entity_merge(en_tokenizer):
+    # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
     heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
-    tags = ['NNP', 'NNP', 'VBZ', 'DT', 'VB', 'RP', 'NN', 'WP', 'VBZ', 'IN', 'NNP', 'CC', 'VBZ', 'NNP', 'NNP', '.', 'SP']
-    ents = [(0, 2, 'PERSON'), (10, 11, 'GPE'), (13, 15, 'PERSON')]
+    tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
+    ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")]
+    # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
+    )
     assert len(doc) == 17
     for ent in doc.ents:
-        label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
+        label, lemma, type_ = (
+            ent.root.tag_,
+            ent.root.lemma_,
+            max(w.ent_type_ for w in ent),
+        )
         ent.merge(label=label, lemma=lemma, ent_type=type_)
     # check looping is ok
     assert len(doc) == 15
@@ -98,8 +112,10 @@ def test_spans_entity_merge_iob():
     # Test entity IOB stays consistent after merging
     words = ["a", "b", "c", "d", "e"]
     doc = Doc(Vocab(), words=words)
-    doc.ents = [(doc.vocab.strings.add('ent-abc'), 0, 3),
-                (doc.vocab.strings.add('ent-d'), 3, 4)]
+    doc.ents = [
+        (doc.vocab.strings.add("ent-abc"), 0, 3),
+        (doc.vocab.strings.add("ent-d"), 3, 4),
+    ]
     assert doc[0].ent_iob_ == "B"
     assert doc[1].ent_iob_ == "I"
     assert doc[2].ent_iob_ == "I"
@@ -110,33 +126,37 @@ def test_spans_entity_merge_iob():
 
 
 def test_spans_sentence_update_after_merge(en_tokenizer):
+    # fmt: off
     text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
     heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
     deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
             'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
             'compound', 'dobj', 'punct']
+    # fmt: on
 
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     sent1, sent2 = list(doc.sents)
     init_len = len(sent1)
     init_len2 = len(sent2)
-    doc[0:2].merge(label='none', lemma='none', ent_type='none')
-    doc[-2:].merge(label='none', lemma='none', ent_type='none')
+    doc[0:2].merge(label="none", lemma="none", ent_type="none")
+    doc[-2:].merge(label="none", lemma="none", ent_type="none")
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1
 
 
 def test_spans_subtree_size_check(en_tokenizer):
+    # fmt: off
     text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
     heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
-    deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
-            'nsubj', 'relcl', 'prep', 'pobj', 'cc', 'conj', 'compound',
-            'dobj']
+    deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
+            "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
+            "dobj"]
+    # fmt: on
 
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     sent1 = list(doc.sents)[0]
     init_len = len(list(sent1.root.subtree))
-    doc[0:2].merge(label='none', lemma='none', ent_type='none')
+    doc[0:2].merge(label="none", lemma="none", ent_type="none")
     assert len(list(sent1.root.subtree)) == init_len - 1
@@ -13,31 +13,35 @@ from ..util import get_doc
 
 @pytest.fixture
 def doc(en_tokenizer):
+    # fmt: off
     text = "This is a sentence. This is another sentence. And a third."
     heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
-    deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
-            'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
+    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
+            "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
+    # fmt: on
     tokens = en_tokenizer(text)
     return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
 
 
 def test_doc_token_api_strings(en_tokenizer):
     text = "Give it back! He pleaded."
-    pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
+    pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"]
     heads = [0, -1, -2, -3, 1, 0, -1]
-    deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
+    deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"]
 
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps)
-    assert doc[0].orth_ == 'Give'
-    assert doc[0].text == 'Give'
-    assert doc[0].text_with_ws == 'Give '
-    assert doc[0].lower_ == 'give'
-    assert doc[0].shape_ == 'Xxxx'
-    assert doc[0].prefix_ == 'G'
-    assert doc[0].suffix_ == 'ive'
-    assert doc[0].pos_ == 'VERB'
-    assert doc[0].dep_ == 'ROOT'
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
+    )
+    assert doc[0].orth_ == "Give"
+    assert doc[0].text == "Give"
+    assert doc[0].text_with_ws == "Give "
+    assert doc[0].lower_ == "give"
+    assert doc[0].shape_ == "Xxxx"
+    assert doc[0].prefix_ == "G"
+    assert doc[0].suffix_ == "ive"
+    assert doc[0].pos_ == "VERB"
+    assert doc[0].dep_ == "ROOT"
 
 
 def test_doc_token_api_flags(en_tokenizer):
@@ -53,7 +57,7 @@ def test_doc_token_api_flags(en_tokenizer):
     # TODO: Test more of these, esp. if a bug is found
 
 
-@pytest.mark.parametrize('text', ["Give it back! He pleaded."])
+@pytest.mark.parametrize("text", ["Give it back! He pleaded."])
 def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text):
     word = text.split()[0]
     en_tokenizer.vocab[word].prob = -1
@ -61,11 +65,11 @@ def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text):
|
||||||
assert tokens[0].prob != 0
|
assert tokens[0].prob != 0
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["one two"])
|
@pytest.mark.parametrize("text", ["one two"])
|
||||||
def test_doc_token_api_str_builtin(en_tokenizer, text):
|
def test_doc_token_api_str_builtin(en_tokenizer, text):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert str(tokens[0]) == text.split(' ')[0]
|
assert str(tokens[0]) == text.split(" ")[0]
|
||||||
assert str(tokens[1]) == text.split(' ')[1]
|
assert str(tokens[1]) == text.split(" ")[1]
|
||||||
|
|
||||||
|
|
||||||
def test_doc_token_api_is_properties(en_vocab):
|
def test_doc_token_api_is_properties(en_vocab):
|
||||||
|
@ -83,16 +87,16 @@ def test_doc_token_api_is_properties(en_vocab):
|
||||||
def test_doc_token_api_vectors():
|
def test_doc_token_api_vectors():
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
vocab.reset_vectors(width=2)
|
vocab.reset_vectors(width=2)
|
||||||
vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
|
vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
|
||||||
vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
|
vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
|
||||||
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
|
doc = Doc(vocab, words=["apples", "oranges", "oov"])
|
||||||
assert doc.has_vector
|
assert doc.has_vector
|
||||||
assert doc[0].has_vector
|
assert doc[0].has_vector
|
||||||
assert doc[1].has_vector
|
assert doc[1].has_vector
|
||||||
assert not doc[2].has_vector
|
assert not doc[2].has_vector
|
||||||
apples_norm = (0*0 + 2*2) ** 0.5
|
apples_norm = (0 * 0 + 2 * 2) ** 0.5
|
||||||
oranges_norm = (0*0 + 1*1) ** 0.5
|
oranges_norm = (0 * 0 + 1 * 1) ** 0.5
|
||||||
cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm)
|
cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
|
||||||
assert doc[0].similarity(doc[1]) == cosine
|
assert doc[0].similarity(doc[1]) == cosine
|
||||||
|
|
||||||
|
|
||||||
|
@ -165,7 +169,7 @@ def test_doc_token_api_head_setter(en_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
def test_is_sent_start(en_tokenizer):
|
def test_is_sent_start(en_tokenizer):
|
||||||
doc = en_tokenizer('This is a sentence. This is another.')
|
doc = en_tokenizer("This is a sentence. This is another.")
|
||||||
assert doc[5].is_sent_start is None
|
assert doc[5].is_sent_start is None
|
||||||
doc[5].is_sent_start = True
|
doc[5].is_sent_start = True
|
||||||
assert doc[5].is_sent_start is True
|
assert doc[5].is_sent_start is True
|
||||||
|
@ -174,17 +178,17 @@ def test_is_sent_start(en_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
def test_set_pos():
|
def test_set_pos():
|
||||||
doc = Doc(Vocab(), words=['hello', 'world'])
|
doc = Doc(Vocab(), words=["hello", "world"])
|
||||||
doc[0].pos_ = 'NOUN'
|
doc[0].pos_ = "NOUN"
|
||||||
assert doc[0].pos_ == 'NOUN'
|
assert doc[0].pos_ == "NOUN"
|
||||||
doc[1].pos = VERB
|
doc[1].pos = VERB
|
||||||
assert doc[1].pos_ == 'VERB'
|
assert doc[1].pos_ == "VERB"
|
||||||
|
|
||||||
|
|
||||||
def test_tokens_sent(doc):
|
def test_tokens_sent(doc):
|
||||||
"""Test token.sent property"""
|
"""Test token.sent property"""
|
||||||
assert len(list(doc.sents)) == 3
|
assert len(list(doc.sents)) == 3
|
||||||
assert doc[1].sent.text == 'This is a sentence .'
|
assert doc[1].sent.text == "This is a sentence ."
|
||||||
assert doc[7].sent.text == 'This is another sentence .'
|
assert doc[7].sent.text == "This is another sentence ."
|
||||||
assert doc[1].sent.root.left_edge.text == 'This'
|
assert doc[1].sent.root.left_edge.text == "This"
|
||||||
assert doc[7].sent.root.left_edge.text == 'This'
|
assert doc[7].sent.root.left_edge.text == "This"
|
||||||
|
|
|
@@ -20,7 +20,7 @@ def test_doc_underscore_getattr_setattr():
    doc = Mock()
    doc.doc = doc
    doc.user_data = {}
-    Underscore.doc_extensions['hello'] = (False, None, None, None)
+    Underscore.doc_extensions["hello"] = (False, None, None, None)
    doc._ = Underscore(Underscore.doc_extensions, doc)
    assert doc._.hello == False
    doc._.hello = True
@@ -29,8 +29,9 @@ def test_doc_underscore_getattr_setattr():

def test_create_span_underscore():
    span = Mock(doc=Mock(), start=0, end=2)
-    uscore = Underscore(Underscore.span_extensions, span,
-                        start=span.start, end=span.end)
+    uscore = Underscore(
+        Underscore.span_extensions, span, start=span.start, end=span.end
+    )
    assert uscore._doc is span.doc
    assert uscore._start is span.start
    assert uscore._end is span.end
@@ -38,60 +39,70 @@ def test_create_span_underscore():

def test_span_underscore_getter_setter():
    span = Mock(doc=Mock(), start=0, end=2)
-    Underscore.span_extensions['hello'] = (None, None,
-                                           lambda s: (s.start, 'hi'),
-                                           lambda s, value: setattr(s, 'start',
-                                                                    value))
-    span._ = Underscore(Underscore.span_extensions, span,
-                        start=span.start, end=span.end)
+    Underscore.span_extensions["hello"] = (
+        None,
+        None,
+        lambda s: (s.start, "hi"),
+        lambda s, value: setattr(s, "start", value),
+    )
+    span._ = Underscore(
+        Underscore.span_extensions, span, start=span.start, end=span.end
+    )

-    assert span._.hello == (0, 'hi')
+    assert span._.hello == (0, "hi")
    span._.hello = 1
-    assert span._.hello == (1, 'hi')
+    assert span._.hello == (1, "hi")


def test_token_underscore_method():
-    token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
-    Underscore.token_extensions['hello'] = (None, token.say_cheese,
-                                            None, None)
+    token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: "cheese")
+    Underscore.token_extensions["hello"] = (None, token.say_cheese, None, None)
    token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
-    assert token._.hello() == 'cheese'
+    assert token._.hello() == "cheese"


-@pytest.mark.parametrize('obj', [Doc, Span, Token])
+@pytest.mark.parametrize("obj", [Doc, Span, Token])
def test_doc_underscore_remove_extension(obj):
-    ext_name = 'to_be_removed'
+    ext_name = "to_be_removed"
    obj.set_extension(ext_name, default=False)
    assert obj.has_extension(ext_name)
    obj.remove_extension(ext_name)
    assert not obj.has_extension(ext_name)


-@pytest.mark.parametrize('obj', [Doc, Span, Token])
+@pytest.mark.parametrize("obj", [Doc, Span, Token])
def test_underscore_raises_for_dup(obj):
-    obj.set_extension('test', default=None)
+    obj.set_extension("test", default=None)
    with pytest.raises(ValueError):
-        obj.set_extension('test', default=None)
+        obj.set_extension("test", default=None)


-@pytest.mark.parametrize('invalid_kwargs', [
-    {'getter': None, 'setter': lambda: None},
-    {'default': None, 'method': lambda: None, 'getter': lambda: None},
-    {'setter': lambda: None},
-    {'default': None, 'method': lambda: None},
-    {'getter': True}])
+@pytest.mark.parametrize(
+    "invalid_kwargs",
+    [
+        {"getter": None, "setter": lambda: None},
+        {"default": None, "method": lambda: None, "getter": lambda: None},
+        {"setter": lambda: None},
+        {"default": None, "method": lambda: None},
+        {"getter": True},
+    ],
+)
def test_underscore_raises_for_invalid(invalid_kwargs):
-    invalid_kwargs['force'] = True
+    invalid_kwargs["force"] = True
    with pytest.raises(ValueError):
-        Doc.set_extension('test', **invalid_kwargs)
+        Doc.set_extension("test", **invalid_kwargs)


-@pytest.mark.parametrize('valid_kwargs', [
-    {'getter': lambda: None},
-    {'getter': lambda: None, 'setter': lambda: None},
-    {'default': 'hello'},
-    {'default': None},
-    {'method': lambda: None}])
+@pytest.mark.parametrize(
+    "valid_kwargs",
+    [
+        {"getter": lambda: None},
+        {"getter": lambda: None, "setter": lambda: None},
+        {"default": "hello"},
+        {"default": None},
+        {"method": lambda: None},
+    ],
+)
def test_underscore_accepts_valid(valid_kwargs):
-    valid_kwargs['force'] = True
-    Doc.set_extension('test', **valid_kwargs)
+    valid_kwargs["force"] = True
+    Doc.set_extension("test", **valid_kwargs)
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('text', ["ق.م", "إلخ", "ص.ب", "ت."])
+@pytest.mark.parametrize("text", ["ق.م", "إلخ", "ص.ب", "ت."])
def test_ar_tokenizer_handles_abbr(ar_tokenizer, text):
    tokens = ar_tokenizer(text)
    assert len(tokens) == 1
@@ -18,7 +18,7 @@ def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
    assert tokens[6].lemma_ == "قبل الميلاد"


-def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
+def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer):
    text = "يبلغ طول مضيق طارق 14كم "
    tokens = ar_tokenizer(text)
    assert len(tokens) == 6
@@ -6,16 +6,22 @@ import pytest

TESTCASES = [
    # punctuation tests
-    ('আমি বাংলায় গান গাই!', ['আমি', 'বাংলায়', 'গান', 'গাই', '!']),
-    ('আমি বাংলায় কথা কই।', ['আমি', 'বাংলায়', 'কথা', 'কই', '।']),
-    ('বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?', ['বসুন্ধরা', 'জনসম্মুখে', 'দোষ', 'স্বীকার', 'করলো', 'না', '?']),
-    ('টাকা থাকলে কি না হয়!', ['টাকা', 'থাকলে', 'কি', 'না', 'হয়', '!']),
+    ("আমি বাংলায় গান গাই!", ["আমি", "বাংলায়", "গান", "গাই", "!"]),
+    ("আমি বাংলায় কথা কই।", ["আমি", "বাংলায়", "কথা", "কই", "।"]),
+    (
+        "বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?",
+        ["বসুন্ধরা", "জনসম্মুখে", "দোষ", "স্বীকার", "করলো", "না", "?"],
+    ),
+    ("টাকা থাকলে কি না হয়!", ["টাকা", "থাকলে", "কি", "না", "হয়", "!"]),
    # abbreviations
-    ('ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।', ['ডঃ', 'খালেদ', 'বললেন', 'ঢাকায়', '৩৫', 'ডিগ্রি', 'সে.', '।'])
+    (
+        "ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।",
+        ["ডঃ", "খালেদ", "বললেন", "ঢাকায়", "৩৫", "ডিগ্রি", "সে.", "।"],
+    ),
]


-@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
+@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
    tokens = bn_tokenizer(text)
    token_list = [token.text for token in tokens if not token.is_space]
@@ -4,19 +4,19 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."])
+@pytest.mark.parametrize("text", ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."])
def test_da_tokenizer_handles_abbr(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


-@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."])
+@pytest.mark.parametrize("text", ["Jul.", "jul.", "Tor.", "Tors."])
def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2


-@pytest.mark.parametrize('text', ["1.", "10.", "31."])
+@pytest.mark.parametrize("text", ["1.", "10.", "31."])
def test_da_tokenizer_handles_dates(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1
@@ -37,8 +37,9 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
    assert tokens[7].text == "."


-@pytest.mark.parametrize('text,norm', [
-    ("akvarium", "akvarie"), ("bedstemoder", "bedstemor")])
+@pytest.mark.parametrize(
+    "text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
+)
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
    tokens = da_tokenizer(text)
    assert tokens[0].norm_ == norm
@@ -4,11 +4,15 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('string,lemma', [
-    ('affaldsgruppernes', 'affaldsgruppe'),
-    ('detailhandelsstrukturernes', 'detailhandelsstruktur'),
-    ('kolesterols', 'kolesterol'),
-    ('åsyns', 'åsyn')])
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("affaldsgruppernes", "affaldsgruppe"),
+        ("detailhandelsstrukturernes", "detailhandelsstruktur"),
+        ("kolesterols", "kolesterol"),
+        ("åsyns", "åsyn"),
+    ],
+)
def test_da_lemmatizer_lookup_assigns(da_tokenizer, string, lemma):
    tokens = da_tokenizer(string)
    assert tokens[0].lemma_ == lemma
@@ -4,19 +4,19 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('text', ["(under)"])
+@pytest.mark.parametrize("text", ["(under)"])
def test_da_tokenizer_splits_no_special(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["ta'r", "Søren's", "Lars'"])
+@pytest.mark.parametrize("text", ["ta'r", "Søren's", "Lars'"])
def test_da_tokenizer_handles_no_punct(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


-@pytest.mark.parametrize('text', ["(ta'r"])
+@pytest.mark.parametrize("text", ["(ta'r"])
def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2
@@ -24,7 +24,7 @@ def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text):
    assert tokens[1].text == "ta'r"


-@pytest.mark.parametrize('text', ["ta'r)"])
+@pytest.mark.parametrize("text", ["ta'r)"])
def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2
@@ -32,15 +32,16 @@ def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text):
    assert tokens[1].text == ")"


-@pytest.mark.parametrize('text,expected', [
-    ("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])])
+@pytest.mark.parametrize(
+    "text,expected", [("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])]
+)
def test_da_tokenizer_splits_even_wrap(da_tokenizer, text, expected):
    tokens = da_tokenizer(text)
    assert len(tokens) == len(expected)
    assert [t.text for t in tokens] == expected


-@pytest.mark.parametrize('text', ["(ta'r?)"])
+@pytest.mark.parametrize("text", ["(ta'r?)"])
def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 4
@@ -50,15 +51,17 @@ def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text):
    assert tokens[3].text == ")"


-@pytest.mark.parametrize('text,expected', [
-    ("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])])
+@pytest.mark.parametrize(
+    "text,expected",
+    [("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])],
+)
def test_da_tokenizer_splits_prefix_interact(da_tokenizer, text, expected):
    tokens = da_tokenizer(text)
    assert len(tokens) == len(expected)
    assert [t.text for t in tokens] == expected


-@pytest.mark.parametrize('text', ["f.eks.)"])
+@pytest.mark.parametrize("text", ["f.eks.)"])
def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 2
@@ -66,7 +69,7 @@ def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text):
    assert tokens[1].text == ")"


-@pytest.mark.parametrize('text', ["(f.eks.)"])
+@pytest.mark.parametrize("text", ["(f.eks.)"])
def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3
@@ -75,7 +78,7 @@ def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text):
    assert tokens[2].text == ")"


-@pytest.mark.parametrize('text', ["(f.eks.?)"])
+@pytest.mark.parametrize("text", ["(f.eks.?)"])
def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 4
@@ -85,19 +88,19 @@ def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text):
    assert tokens[3].text == ")"


-@pytest.mark.parametrize('text', ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"])
+@pytest.mark.parametrize("text", ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"])
def test_da_tokenizer_handles_numeric_range(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


-@pytest.mark.parametrize('text', ["sort.Gul", "Hej.Verden"])
+@pytest.mark.parametrize("text", ["sort.Gul", "Hej.Verden"])
def test_da_tokenizer_splits_period_infix(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["Hej,Verden", "en,to"])
+@pytest.mark.parametrize("text", ["Hej,Verden", "en,to"])
def test_da_tokenizer_splits_comma_infix(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3
@@ -106,20 +109,25 @@ def test_da_tokenizer_splits_comma_infix(da_tokenizer, text):
    assert tokens[2].text == text.split(",")[1]


-@pytest.mark.parametrize('text', ["sort...Gul", "sort...gul"])
+@pytest.mark.parametrize("text", ["sort...Gul", "sort...gul"])
def test_da_tokenizer_splits_ellipsis_infix(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ['gå-på-mod', '4-hjulstræk', '100-Pfennig-frimærke', 'TV-2-spots', 'trofæ-vaeggen'])
+@pytest.mark.parametrize(
+    "text",
+    ["gå-på-mod", "4-hjulstræk", "100-Pfennig-frimærke", "TV-2-spots", "trofæ-vaeggen"],
+)
def test_da_tokenizer_keeps_hyphens(da_tokenizer, text):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1


def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer):
-    tokens = da_tokenizer("Mange regler--eksempelvis bindestregs-reglerne--er komplicerede.")
+    tokens = da_tokenizer(
+        "Mange regler--eksempelvis bindestregs-reglerne--er komplicerede."
+    )
    assert len(tokens) == 9
    assert tokens[0].text == "Mange"
    assert tokens[1].text == "regler"
@@ -132,7 +140,9 @@ def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer):


def test_da_tokenizer_handles_posessives_and_contractions(da_tokenizer):
-    tokens = da_tokenizer("'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun.")
+    tokens = da_tokenizer(
+        "'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun."
+    )
    assert len(tokens) == 25
    assert tokens[0].text == "'"
    assert tokens[1].text == "DBA's"
@@ -15,17 +15,29 @@ Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der
    assert len(tokens) == 84


-@pytest.mark.parametrize('text,match', [
-    ('10', True), ('1', True), ('10.000', True), ('10.00', True),
-    ('999,0', True), ('en', True), ('treoghalvfemsindstyvende', True), ('hundrede', True),
-    ('hund', False), (',', False), ('1/2', True)])
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10.000", True),
+        ("10.00", True),
+        ("999,0", True),
+        ("en", True),
+        ("treoghalvfemsindstyvende", True),
+        ("hundrede", True),
+        ("hund", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
def test_lex_attrs_like_number(da_tokenizer, text, match):
    tokens = da_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].like_num == match


-@pytest.mark.parametrize('word', ['elleve', 'første'])
+@pytest.mark.parametrize("word", ["elleve", "første"])
def test_da_lex_attrs_capitals(word):
    assert like_num(word)
    assert like_num(word.upper())
@@ -4,13 +4,13 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
+@pytest.mark.parametrize("text", ["auf'm", "du's", "über'm", "wir's"])
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 2


-@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
+@pytest.mark.parametrize("text", ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 1
@@ -24,14 +24,16 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
    assert tokens[2].lemma_ == "zur Zeit"


-@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
+@pytest.mark.parametrize(
+    "text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
+)
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
    tokens = de_tokenizer(text)
    assert [token.norm_ for token in tokens] == norms


@pytest.mark.xfail
-@pytest.mark.parametrize('text,norm', [("daß", "dass")])
+@pytest.mark.parametrize("text,norm", [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
    tokens = de_tokenizer(text)
    assert tokens[0].norm_ == norm
@@ -4,13 +4,17 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('string,lemma', [
-    ('Abgehängten', 'Abgehängte'),
-    ('engagierte', 'engagieren'),
-    ('schließt', 'schließen'),
-    ('vorgebenden', 'vorgebend'),
-    ('die', 'der'),
-    ('Die', 'der')])
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("Abgehängten", "Abgehängte"),
+        ("engagierte", "engagieren"),
+        ("schließt", "schließen"),
+        ("vorgebenden", "vorgebend"),
+        ("die", "der"),
+        ("Die", "der"),
+    ],
+)
def test_de_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
    tokens = de_tokenizer(string)
    assert tokens[0].lemma_ == lemma
@@ -7,10 +7,12 @@ from ...util import get_doc
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
    text = "Eine Tasse steht auf dem Tisch."
    heads = [1, 1, 0, -1, 1, -2, -4]
-    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
-    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
+    tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "$."]
+    deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"]
    tokens = de_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "Eine Tasse "
@@ -20,10 +22,12 @@ def test_de_parser_noun_chunks_standard_de(de_tokenizer):
def test_de_extended_chunk(de_tokenizer):
    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
-    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
-    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
+    tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "NN", "NN", "$."]
+    deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"]
    tokens = de_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "Die Sängerin "
@@ -4,79 +4,79 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('text', ["(unter)"])
+@pytest.mark.parametrize("text", ["(unter)"])
def test_de_tokenizer_splits_no_special(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["unter'm"])
+@pytest.mark.parametrize("text", ["unter'm"])
def test_de_tokenizer_splits_no_punct(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 2


-@pytest.mark.parametrize('text', ["(unter'm"])
+@pytest.mark.parametrize("text", ["(unter'm"])
def test_de_tokenizer_splits_prefix_punct(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["unter'm)"])
+@pytest.mark.parametrize("text", ["unter'm)"])
def test_de_tokenizer_splits_suffix_punct(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["(unter'm)"])
+@pytest.mark.parametrize("text", ["(unter'm)"])
def test_de_tokenizer_splits_even_wrap(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["(unter'm?)"])
+@pytest.mark.parametrize("text", ["(unter'm?)"])
def test_de_tokenizer_splits_uneven_wrap(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 5


-@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
+@pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
def test_de_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
    tokens = de_tokenizer(text)
    assert len(tokens) == length


-@pytest.mark.parametrize('text', ["z.B.)"])
+@pytest.mark.parametrize("text", ["z.B.)"])
def test_de_tokenizer_splits_suffix_interact(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 2


-@pytest.mark.parametrize('text', ["(z.B.)"])
+@pytest.mark.parametrize("text", ["(z.B.)"])
def test_de_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["(z.B.?)"])
+@pytest.mark.parametrize("text", ["(z.B.?)"])
def test_de_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_de_tokenizer_splits_numeric_range(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
+@pytest.mark.parametrize("text", ["blau.Rot", "Hallo.Welt"])
def test_de_tokenizer_splits_period_infix(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
+@pytest.mark.parametrize("text", ["Hallo,Welt", "eins,zwei"])
def test_de_tokenizer_splits_comma_infix(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3
@@ -85,13 +85,13 @@ def test_de_tokenizer_splits_comma_infix(de_tokenizer, text):
    assert tokens[2].text == text.split(",")[1]


-@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
+@pytest.mark.parametrize("text", ["blau...Rot", "blau...rot"])
def test_de_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
+@pytest.mark.parametrize("text", ["Islam-Konferenz", "Ost-West-Konflikt"])
def test_de_tokenizer_keeps_hyphens(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 1
@@ -22,19 +22,27 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
    assert len(tokens) == 109


-@pytest.mark.parametrize('text', [
-    "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
-    "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
-    "Kraftfahrzeug-Haftpflichtversicherung",
-    "Vakuum-Mittelfrequenz-Induktionsofen"])
+@pytest.mark.parametrize(
+    "text",
+    [
+        "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
+        "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
+        "Kraftfahrzeug-Haftpflichtversicherung",
+        "Vakuum-Mittelfrequenz-Induktionsofen",
+    ],
+)
def test_de_tokenizer_handles_long_words(de_tokenizer, text):
    tokens = de_tokenizer(text)
    assert len(tokens) == 1


-@pytest.mark.parametrize('text,length', [
-    ("»Was ist mit mir geschehen?«, dachte er.", 12),
-    ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        ("»Was ist mit mir geschehen?«, dachte er.", 12),
+        ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15),
+    ],
+)
def test_de_tokenizer_handles_examples(de_tokenizer, text, length):
    tokens = de_tokenizer(text)
    assert len(tokens) == length
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest


-@pytest.mark.parametrize('text', ["αριθ.", "τρισ.", "δισ.", "σελ."])
+@pytest.mark.parametrize("text", ["αριθ.", "τρισ.", "δισ.", "σελ."])
def test_el_tokenizer_handles_abbr(el_tokenizer, text):
    tokens = el_tokenizer(text)
    assert len(tokens) == 1
@@ -13,12 +13,22 @@ def test_el_tokenizer_handles_long_text(el_tokenizer):
    assert len(tokens) == 54


-@pytest.mark.parametrize('text,length',[
-    ("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8),
-    ("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10),
-    ("Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.", 19),
-    ("Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.", 15),
-    ("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9)])
-def test_el_tokenizer_handles_cnts(el_tokenizer,text, length):
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        ("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8),
+        ("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10),
+        (
+            "Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.",
+            19,
+        ),
+        (
+            "Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.",
+            15,
+        ),
+        ("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9),
+    ],
+)
+def test_el_tokenizer_handles_cnts(el_tokenizer, text, length):
    tokens = el_tokenizer(text)
    assert len(tokens) == length
@@ -12,29 +12,66 @@ from spacy.util import compile_infix_regex
def custom_en_tokenizer(en_vocab):
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
-    custom_infixes = ['\.\.\.+',
-                      '(?<=[0-9])-(?=[0-9])',
-                      # '(?<=[0-9]+),(?=[0-9]+)',
-                      '[0-9]+(,[0-9]+)+',
-                      '[\[\]!&:,()\*—–\/-]']
+    custom_infixes = [
+        "\.\.\.+",
+        "(?<=[0-9])-(?=[0-9])",
+        # '(?<=[0-9]+),(?=[0-9]+)',
+        "[0-9]+(,[0-9]+)+",
+        "[\[\]!&:,()\*—–\/-]",
+    ]
    infix_re = compile_infix_regex(custom_infixes)
-    return Tokenizer(en_vocab,
-                     English.Defaults.tokenizer_exceptions,
-                     prefix_re.search,
-                     suffix_re.search,
-                     infix_re.finditer,
-                     token_match=None)
+    return Tokenizer(
+        en_vocab,
+        English.Defaults.tokenizer_exceptions,
+        prefix_re.search,
+        suffix_re.search,
+        infix_re.finditer,
+        token_match=None,
+    )


def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
    sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion."
    context = [word.text for word in custom_en_tokenizer(sentence)]
-    assert context == ['The', '8', 'and', '10', '-', 'county', 'definitions',
-                       'are', 'not', 'used', 'for', 'the', 'greater',
-                       'Southern', 'California', 'Megaregion', '.']
+    assert context == [
+        "The",
+        "8",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
    # the trailing '-' may cause Assertion Error
    sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion."
    context = [word.text for word in custom_en_tokenizer(sentence)]
-    assert context == ['The', '8', '-', 'and', '10', '-', 'county',
-                       'definitions', 'are', 'not', 'used', 'for', 'the',
-                       'greater', 'Southern', 'California', 'Megaregion', '.']
+    assert context == [
+        "The",
+        "8",
+        "-",
+        "and",
+        "10",
+        "-",
+        "county",
+        "definitions",
+        "are",
+        "not",
+        "used",
+        "for",
+        "the",
+        "greater",
+        "Southern",
+        "California",
+        "Megaregion",
+        ".",
+    ]
@@ -15,13 +15,15 @@ def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
    assert tokens[4].text == "!"


-@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
+@pytest.mark.parametrize("text", ["`ain't", """"isn't""", "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 3


-@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
+@pytest.mark.parametrize(
+    "text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")]
+)
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    tokens = en_tokenizer(text_poss)
    assert len(tokens) == 2
@@ -29,7 +31,7 @@ def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    assert tokens[1].text == "'s"


-@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
+@pytest.mark.parametrize("text", ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
@@ -37,14 +39,14 @@ def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    assert tokens[1].text == "'"


-@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
+@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text


-@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
+@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
@@ -53,7 +55,9 @@ def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    assert tokens[1].lemma_ == "will"


-@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
+@pytest.mark.parametrize(
+    "text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")]
+)
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    tokens_lower = en_tokenizer(text_lower)
    tokens_title = en_tokenizer(text_title)
@@ -62,21 +66,23 @@ def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_titl
    assert tokens_lower[1].text == tokens_title[1].text


-@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
-@pytest.mark.parametrize('contraction', ["'ll", "'d"])
+@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"])
+@pytest.mark.parametrize("contraction", ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    tokens = en_tokenizer(pron + contraction)
    assert tokens[0].text == pron
    assert tokens[1].text == contraction


-@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
+@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    tokens = en_tokenizer(exc)
    assert len(tokens) == 1


-@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
+@pytest.mark.parametrize(
+    "wo_punct,w_punct", [("We've", "``We've"), ("couldn't", "couldn't)")]
+)
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    tokens = en_tokenizer(wo_punct)
    assert len(tokens) == 2
@@ -84,7 +90,7 @@ def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
+@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
@@ -97,20 +103,24 @@ def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
    assert tokens[3].text == "i.e."


-@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
+@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"])
def test_en_tokenizer_handles_times(en_tokenizer, text):
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[1].lemma_ in ["a.m.", "p.m."]


-@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
+@pytest.mark.parametrize(
+    "text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]
+)
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
    tokens = en_tokenizer(text)
    assert [token.norm_ for token in tokens] == norms


-@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
+@pytest.mark.parametrize(
+    "text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
+)
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
    tokens = en_tokenizer(text)
    assert tokens[0].norm_ == norm
@@ -12,14 +12,25 @@ from ...util import get_doc
def test_en_noun_chunks_not_nested(en_tokenizer):
    text = "Peter has chronic command and control issues"
    heads = [1, 0, 4, 3, -1, -2, -5]
-    deps = ['nsubj', 'ROOT', 'amod', 'nmod', 'cc', 'conj', 'dobj']
+    deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
    tokens = en_tokenizer(text)
    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
    tokens.from_array(
        [HEAD, DEP],
-        numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
-                       [-2, conj], [-5, dobj]], dtype='uint64'))
-    tokens.noun_chunks_iterator = SYNTAX_ITERATORS['noun_chunks']
+        numpy.asarray(
+            [
+                [1, nsubj],
+                [0, root],
+                [4, amod],
+                [3, nmod],
+                [-1, cc],
+                [-2, conj],
+                [-5, dobj],
+            ],
+            dtype="uint64",
+        ),
+    )
+    tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
    word_occurred = {}
    for chunk in tokens.noun_chunks:
        for word in chunk:
@@ -7,22 +7,28 @@ from ...util import get_doc
 def test_en_parser_noun_chunks_standard(en_tokenizer):
     text = "A base phrase should be recognized."
     heads = [2, 1, 3, 2, 1, 0, -1]
-    tags = ['DT', 'JJ', 'NN', 'MD', 'VB', 'VBN', '.']
-    deps = ['det', 'amod', 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'punct']
+    tags = ["DT", "JJ", "NN", "MD", "VB", "VBN", "."]
+    deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 1
     assert chunks[0].text_with_ws == "A base phrase "


 def test_en_parser_noun_chunks_coordinated(en_tokenizer):
+    # fmt: off
     text = "A base phrase and a good phrase are often the same."
     heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
-    tags = ['DT', 'NN', 'NN', 'CC', 'DT', 'JJ', 'NN', 'VBP', 'RB', 'DT', 'JJ', '.']
-    deps = ['det', 'compound', 'nsubj', 'cc', 'det', 'amod', 'conj', 'ROOT', 'advmod', 'det', 'attr', 'punct']
+    tags = ["DT", "NN", "NN", "CC", "DT", "JJ", "NN", "VBP", "RB", "DT", "JJ", "."]
+    deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
+    # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 2
     assert chunks[0].text_with_ws == "A base phrase "

@@ -32,10 +38,12 @@ def test_en_parser_noun_chunks_coordinated(en_tokenizer):
 def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
     text = "A phrase with another phrase occurs."
     heads = [1, 4, -1, 1, -2, 0, -1]
-    tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', '.']
-    deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT', 'punct']
+    tags = ["DT", "NN", "IN", "DT", "NN", "VBZ", "."]
+    deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 2
     assert chunks[0].text_with_ws == "A phrase "

@@ -43,12 +51,16 @@ def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):


 def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
+    # fmt: off
     text = "Sam, my brother, arrived to the house."
     heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
-    tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.']
-    deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct']
+    tags = ["NNP", ",", "PRP$", "NN", ",", "VBD", "IN", "DT", "NN", "."]
+    deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
+    # fmt: on
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 3
     assert chunks[0].text_with_ws == "Sam "

@@ -59,10 +71,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
 def test_en_parser_noun_chunks_dative(en_tokenizer):
     text = "She gave Bob a raise."
     heads = [1, 0, -1, 1, -3, -4]
-    tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.']
-    deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct']
+    tags = ["PRP", "VBD", "NNP", "DT", "NN", "."]
+    deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
     tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    doc = get_doc(
+        tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
+    )
     chunks = list(doc.noun_chunks)
     assert len(chunks) == 3
     assert chunks[0].text_with_ws == "She "
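The `# fmt: off` / `# fmt: on` comments added in the hunks above fence off hand-aligned test data so Black leaves it alone while still auto-formatting the rest of the file. A minimal sketch of the pattern, reusing values from the diff above (purely illustrative):

# Black reformats everything outside the fmt: off/on block, but leaves the
# manually aligned heads/deps tables exactly as written.
# fmt: off
heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT",
        "advmod", "det", "attr", "punct"]
# fmt: on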
@@ -4,85 +4,85 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.parametrize('text', ["(can)"])
+@pytest.mark.parametrize("text", ["(can)"])
 def test_en_tokenizer_splits_no_special(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["can't"])
+@pytest.mark.parametrize("text", ["can't"])
 def test_en_tokenizer_splits_no_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2


-@pytest.mark.parametrize('text', ["(can't"])
+@pytest.mark.parametrize("text", ["(can't"])
 def test_en_tokenizer_splits_prefix_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["can't)"])
+@pytest.mark.parametrize("text", ["can't)"])
 def test_en_tokenizer_splits_suffix_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["(can't)"])
+@pytest.mark.parametrize("text", ["(can't)"])
 def test_en_tokenizer_splits_even_wrap(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["(can't?)"])
+@pytest.mark.parametrize("text", ["(can't?)"])
 def test_en_tokenizer_splits_uneven_wrap(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 5


-@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
+@pytest.mark.parametrize("text,length", [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
 def test_en_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
     assert len(tokens) == length


-@pytest.mark.parametrize('text', ["U.S.)"])
+@pytest.mark.parametrize("text", ["U.S.)"])
 def test_en_tokenizer_splits_suffix_interact(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2


-@pytest.mark.parametrize('text', ["(U.S.)"])
+@pytest.mark.parametrize("text", ["(U.S.)"])
 def test_en_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["(U.S.?)"])
+@pytest.mark.parametrize("text", ["(U.S.?)"])
 def test_en_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 4


-@pytest.mark.parametrize('text', ["best-known"])
+@pytest.mark.parametrize("text", ["best-known"])
 def test_en_tokenizer_splits_hyphens(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
 def test_en_tokenizer_splits_numeric_range(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
+@pytest.mark.parametrize("text", ["best.Known", "Hello.World"])
 def test_en_tokenizer_splits_period_infix(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3


-@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
+@pytest.mark.parametrize("text", ["Hello,world", "one,two"])
 def test_en_tokenizer_splits_comma_infix(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3

@@ -91,7 +91,7 @@ def test_en_tokenizer_splits_comma_infix(en_tokenizer, text):
     assert tokens[2].text == text.split(",")[1]


-@pytest.mark.parametrize('text', ["best...Known", "best...known"])
+@pytest.mark.parametrize("text", ["best...Known", "best...known"])
 def test_en_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3

@@ -126,8 +126,10 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
 @pytest.mark.xfail
 def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
     # Re Issue #225
-    tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """
-                          """you'll have to walk there.\u2014Ariel.""")
+    tokens = en_tokenizer(
+        """Will this road take me to Puddleton?\u2014No, """
+        """you'll have to walk there.\u2014Ariel."""
+    )
     assert tokens[6].text == "Puddleton"
     assert tokens[7].text == "?"
     assert tokens[8].text == "\u2014"
@ -6,19 +6,19 @@ from spacy.util import compile_prefix_regex
|
||||||
from spacy.lang.punctuation import TOKENIZER_PREFIXES
|
from spacy.lang.punctuation import TOKENIZER_PREFIXES
|
||||||
|
|
||||||
|
|
||||||
PUNCT_OPEN = ['(', '[', '{', '*']
|
PUNCT_OPEN = ["(", "[", "{", "*"]
|
||||||
PUNCT_CLOSE = [')', ']', '}', '*']
|
PUNCT_CLOSE = [")", "]", "}", "*"]
|
||||||
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(", "((", "<"])
|
@pytest.mark.parametrize("text", ["(", "((", "<"])
|
||||||
def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
|
def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert len(tokens) == len(text)
|
assert len(tokens) == len(text)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
||||||
tokens = en_tokenizer(punct + text)
|
tokens = en_tokenizer(punct + text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
@ -26,8 +26,8 @@ def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
||||||
assert tokens[1].text == text
|
assert tokens[1].text == text
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
||||||
tokens = en_tokenizer(text + punct)
|
tokens = en_tokenizer(text + punct)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
@ -35,9 +35,9 @@ def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
||||||
assert tokens[1].text == punct
|
assert tokens[1].text == punct
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||||
@pytest.mark.parametrize('punct_add', ["`"])
|
@pytest.mark.parametrize("punct_add", ["`"])
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
|
def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
|
||||||
tokens = en_tokenizer(punct + punct_add + text)
|
tokens = en_tokenizer(punct + punct_add + text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
@ -46,9 +46,9 @@ def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add,
|
||||||
assert tokens[2].text == text
|
assert tokens[2].text == text
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||||
@pytest.mark.parametrize('punct_add', ["'"])
|
@pytest.mark.parametrize("punct_add", ["'"])
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
|
def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
|
||||||
tokens = en_tokenizer(text + punct + punct_add)
|
tokens = en_tokenizer(text + punct + punct_add)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
@ -57,8 +57,8 @@ def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add
|
||||||
assert tokens[2].text == punct_add
|
assert tokens[2].text == punct_add
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
@pytest.mark.parametrize("punct", PUNCT_OPEN)
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
||||||
tokens = en_tokenizer(punct + punct + punct + text)
|
tokens = en_tokenizer(punct + punct + punct + text)
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
|
@ -66,8 +66,8 @@ def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
||||||
assert tokens[3].text == text
|
assert tokens[3].text == text
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
||||||
tokens = en_tokenizer(text + punct + punct + punct)
|
tokens = en_tokenizer(text + punct + punct + punct)
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
|
@ -75,14 +75,14 @@ def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
||||||
assert tokens[1].text == punct
|
assert tokens[1].text == punct
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["'The"])
|
@pytest.mark.parametrize("text", ["'The"])
|
||||||
def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
|
def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert tokens[0].text == "'"
|
assert tokens[0].text == "'"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["Hello''"])
|
@pytest.mark.parametrize("text", ["Hello''"])
|
||||||
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
@ -90,10 +90,11 @@ def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
||||||
assert len(tokens_punct) == 1
|
assert len(tokens_punct) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
def test_en_tokenizer_splits_open_close_punct(
|
||||||
punct_close, text):
|
en_tokenizer, punct_open, punct_close, text
|
||||||
|
):
|
||||||
tokens = en_tokenizer(punct_open + text + punct_close)
|
tokens = en_tokenizer(punct_open + text + punct_close)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
assert tokens[0].text == punct_open
|
assert tokens[0].text == punct_open
|
||||||
|
@ -101,11 +102,12 @@ def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
||||||
assert tokens[2].text == punct_close
|
assert tokens[2].text == punct_close
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
|
||||||
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
|
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
def test_en_tokenizer_two_diff_punct(
|
||||||
punct_open2, punct_close2, text):
|
en_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
|
||||||
|
):
|
||||||
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
||||||
assert len(tokens) == 5
|
assert len(tokens) == 5
|
||||||
assert tokens[0].text == punct_open2
|
assert tokens[0].text == punct_open2
|
||||||
|
@ -115,7 +117,7 @@ def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
||||||
assert tokens[4].text == punct_close2
|
assert tokens[4].text == punct_close2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
|
@pytest.mark.parametrize("text,punct", [("(can't", "(")])
|
||||||
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
|
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
|
||||||
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||||
match = en_search_prefixes(text)
|
match = en_search_prefixes(text)
|
||||||
|
|
|
@@ -6,8 +6,8 @@ import pytest
 from ...util import get_doc, apply_transition_sequence


-@pytest.mark.parametrize('text', ["A test sentence"])
-@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
+@pytest.mark.parametrize("text", ["A test sentence"])
+@pytest.mark.parametrize("punct", [".", "!", "?", ""])
 def test_en_sbd_single_punct(en_tokenizer, text, punct):
     heads = [2, 1, 0, -1] if punct else [2, 1, 0]
     tokens = en_tokenizer(text + punct)

@@ -19,16 +19,18 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):

 @pytest.mark.xfail
 def test_en_sentence_breaks(en_tokenizer, en_parser):
+    # fmt: off
     text = "This is a sentence . This is another one ."
     heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
-    deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
-            'attr', 'punct']
-    transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
-                  'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
+    deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
+            "attr", "punct"]
+    transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT",
+                  "L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"]
+    # fmt: on
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
     apply_transition_sequence(en_parser, doc, transition)
     assert len(list(doc.sents)) == 2
     for token in doc:
         assert token.dep != 0 or token.is_space
-    assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
+    assert [token.head.i for token in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]

@@ -6,10 +6,10 @@ from ...util import get_doc

 def test_en_tagger_load_morph_exc(en_tokenizer):
     text = "I like his style."
-    tags = ['PRP', 'VBP', 'PRP$', 'NN', '.']
-    morph_exc = {'VBP': {'like': {'lemma': 'luck'}}}
+    tags = ["PRP", "VBP", "PRP$", "NN", "."]
+    morph_exc = {"VBP": {"like": {"lemma": "luck"}}}
     en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
     tokens = en_tokenizer(text)
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags)
-    assert doc[1].tag_ == 'VBP'
-    assert doc[1].lemma_ == 'luck'
+    assert doc[1].tag_ == "VBP"
+    assert doc[1].lemma_ == "luck"
@@ -20,30 +20,48 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
     assert len(tokens) == 76


-@pytest.mark.parametrize('text,length', [
-    ("The U.S. Army likes Shock and Awe.", 8),
-    ("U.N. regulations are not a part of their concern.", 10),
-    ("“Isn't it?”", 6),
-    ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
-    ("""'Me too!', Mr. P. Delaware cried. """, 11),
-    ("They ran about 10km.", 6),
-    pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        ("The U.S. Army likes Shock and Awe.", 8),
+        ("U.N. regulations are not a part of their concern.", 10),
+        ("“Isn't it?”", 6),
+        ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
+        ("""'Me too!', Mr. P. Delaware cried. """, 11),
+        ("They ran about 10km.", 6),
+        pytest.param(
+            "But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
+        ),
+    ],
+)
 def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
     assert len(tokens) == length


-@pytest.mark.parametrize('text,match', [
-    ('10', True), ('1', True), ('10,000', True), ('10,00', True),
-    ('999.0', True), ('one', True), ('two', True), ('billion', True),
-    ('dog', False), (',', False), ('1/2', True)])
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("10,000", True),
+        ("10,00", True),
+        ("999.0", True),
+        ("one", True),
+        ("two", True),
+        ("billion", True),
+        ("dog", False),
+        (",", False),
+        ("1/2", True),
+    ],
+)
 def test_lex_attrs_like_number(en_tokenizer, text, match):
     tokens = en_tokenizer(text)
     assert len(tokens) == 1
     assert tokens[0].like_num == match


-@pytest.mark.parametrize('word', ['eleven'])
+@pytest.mark.parametrize("word", ["eleven"])
 def test_en_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())
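The hunk above also swaps the pytest 3.x idiom of wrapping a parametrize case in `pytest.mark.xfail(...)` for `pytest.param(..., marks=pytest.mark.xfail())`, which is the form pytest 4.0 accepts. A minimal sketch of the pattern, with hypothetical test values:

import pytest


@pytest.mark.parametrize(
    "text,length",
    [
        ("an ordinary case", 4),
        # cases expected to fail are wrapped in pytest.param with marks=...
        pytest.param("a known-broken case", 10, marks=pytest.mark.xfail()),
    ],
)
def test_token_count(en_tokenizer, text, length):
    assert len(en_tokenizer(text)) == length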
@@ -4,11 +4,15 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.parametrize('text,lemma', [
-    ("aprox.", "aproximadamente"),
-    ("esq.", "esquina"),
-    ("pág.", "página"),
-    ("p.ej.", "por ejemplo")])
+@pytest.mark.parametrize(
+    "text,lemma",
+    [
+        ("aprox.", "aproximadamente"),
+        ("esq.", "esquina"),
+        ("pág.", "página"),
+        ("p.ej.", "por ejemplo"),
+    ],
+)
 def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma):
     tokens = es_tokenizer(text)
     assert len(tokens) == 1

@@ -20,12 +20,16 @@ en Montevideo y que pregona las bondades de la vida austera."""
     assert len(tokens) == 90


-@pytest.mark.parametrize('text,length', [
-    ("¿Por qué José Mujica?", 6),
-    ("“¿Oh no?”", 6),
-    ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
-    ("Corrieron aprox. 10km.", 5),
-    ("Y entonces por qué...", 5)])
+@pytest.mark.parametrize(
+    "text,length",
+    [
+        ("¿Por qué José Mujica?", 6),
+        ("“¿Oh no?”", 6),
+        ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
+        ("Corrieron aprox. 10km.", 5),
+        ("Y entonces por qué...", 5),
+    ],
+)
 def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
     tokens = es_tokenizer(text)
     assert len(tokens) == length

@@ -5,12 +5,15 @@ import pytest


 ABBREVIATION_TESTS = [
-    ('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
-    ('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
+    (
+        "Hyvää uutta vuotta t. siht. Niemelä!",
+        ["Hyvää", "uutta", "vuotta", "t.", "siht.", "Niemelä", "!"],
+    ),
+    ("Paino on n. 2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]),
 ]


-@pytest.mark.parametrize('text,expected_tokens', ABBREVIATION_TESTS)
+@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
 def test_fi_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
     tokens = fi_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
@@ -2,26 +2,26 @@
 from __future__ import unicode_literals

 import pytest
-from .... import util
-
-
-@pytest.fixture(scope='module')
-def fr_tokenizer():
-    return util.get_lang_class('fr').Defaults.create_tokenizer()


-@pytest.mark.parametrize('text', [
-    "aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"])
+@pytest.mark.parametrize(
+    "text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"]
+)
 def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 1


-@pytest.mark.parametrize('text,lemma', [
-    ("janv.", "janvier"),
-    ("juill.", "juillet"),
-    ("Dr.", "docteur"),
-    ("av.", "avant"),
-    ("sept.", "septembre")])
+@pytest.mark.parametrize(
+    "text,lemma",
+    [
+        ("janv.", "janvier"),
+        ("juill.", "juillet"),
+        ("Dr.", "docteur"),
+        ("av.", "avant"),
+        ("sept.", "septembre"),
+    ],
+)
 def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 1

@@ -57,6 +57,7 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
     assert tokens[2].lemma_ == "ce"


+@pytest.mark.xfail
 def test_fr_tokenizer_handles_title_2(fr_tokenizer):
     text = "Est-ce pas génial?"
     tokens = fr_tokenizer(text)

@@ -65,7 +66,7 @@ def test_fr_tokenizer_handles_title_2(fr_tokenizer):
     assert tokens[0].lemma_ == "être"


-def test_fr_tokenizer_handles_title_2(fr_tokenizer):
+def test_fr_tokenizer_handles_title_3(fr_tokenizer):
     text = "Qu'est-ce que tu fais?"
     tokens = fr_tokenizer(text)
     assert len(tokens) == 7

@@ -16,7 +16,9 @@ def test_fr_lemmatizer_noun_verb_2(fr_tokenizer):
     assert tokens[4].lemma_ == "être"


-@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
+@pytest.mark.xfail(
+    reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN"
+)
 def test_fr_lemmatizer_noun(fr_tokenizer):
     tokens = fr_tokenizer("il y a des Costaricienne.")
     assert tokens[4].lemma_ == "Costaricain"

@@ -7,11 +7,12 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES
 from spacy.lang.char_classes import ALPHA


-@pytest.mark.parametrize('text,expected_tokens', [
-    ("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])])
+@pytest.mark.parametrize(
+    "text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])]
+)
 def test_issue768(text, expected_tokens):
     """Allow zero-width 'infix' token during the tokenization process."""
-    SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA)
+    SPLIT_INFIX = r"(?<=[{a}]\')(?=[{a}])".format(a=ALPHA)

     class FrenchTest(Language):
         class Defaults(Language.Defaults):
@@ -1,13 +1,5 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pytest
-
-from .... import util
-
-
-@pytest.fixture(scope='module')
-def fr_tokenizer():
-    return util.get_lang_class('fr').Defaults.create_tokenizer()
 import pytest
 from spacy.lang.fr.lex_attrs import like_num

@@ -27,7 +19,7 @@ ou avec un autre vrai humain."""
     assert len(tokens) == 113


-@pytest.mark.parametrize('word', ['onze', 'onzième'])
+@pytest.mark.parametrize("word", ["onze", "onzième"])
 def test_fr_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())
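The two French hunks above drop the module-local `fr_tokenizer` fixture; as the commit notes, fixtures are now only consumed via test arguments, with a single shared definition. A minimal sketch of that layout, assuming a conftest-style registration (scope and file names are illustrative):

# conftest.py -- one shared tokenizer fixture instead of a copy per module
import pytest
from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def fr_tokenizer():
    return get_lang_class("fr").Defaults.create_tokenizer()


# test module -- the fixture is requested purely through the argument name
def test_fr_tokenizer_keeps_contraction(fr_tokenizer):
    assert len(fr_tokenizer("aujourd'hui")) == 1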
@@ -4,13 +4,15 @@ from __future__ import unicode_literals
 import pytest


+# fmt: off
 GA_TOKEN_EXCEPTION_TESTS = [
-    ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']),
-    ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
+    ("Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).", ["Niall", "Ó", "Domhnaill", ",", "Rialtas", "na", "hÉireann", "1977", "(", "lch.", "600", ")", "."]),
+    ("Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise", ["Daoine", "a", "bhfuil", "Gaeilge", "acu", ",", "m.sh.", "tusa", "agus", "mise"])
 ]
+# fmt: on


-@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS)
+@pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS)
 def test_ga_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
     tokens = ga_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]

@@ -4,20 +4,41 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.parametrize('text,expected_tokens',
-    [('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])])
+@pytest.mark.parametrize(
+    "text,expected_tokens",
+    [("פייתון היא שפת תכנות דינמית", ["פייתון", "היא", "שפת", "תכנות", "דינמית"])],
+)
 def test_he_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens):
     tokens = he_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]
     assert expected_tokens == token_list


-@pytest.mark.parametrize('text,expected_tokens', [
-    ('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.']),
-    ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']),
-    ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']),
-    ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']),
-    ('עקבת אחריו בכל רחבי המדינה...', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '...'])])
+@pytest.mark.parametrize(
+    "text,expected_tokens",
+    [
+        (
+            "עקבת אחריו בכל רחבי המדינה.",
+            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "."],
+        ),
+        (
+            "עקבת אחריו בכל רחבי המדינה?",
+            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "?"],
+        ),
+        (
+            "עקבת אחריו בכל רחבי המדינה!",
+            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "!"],
+        ),
+        (
+            "עקבת אחריו בכל רחבי המדינה..",
+            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", ".."],
+        ),
+        (
+            "עקבת אחריו בכל רחבי המדינה...",
+            ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "..."],
+        ),
+    ],
+)
 def test_he_tokenizer_handles_punct(he_tokenizer, text, expected_tokens):
     tokens = he_tokenizer(text)
     assert expected_tokens == [token.text for token in tokens]
@@ -6,11 +6,11 @@ import pytest

 DEFAULT_TESTS = [
     ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
-    pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
+    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()),
     ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
     ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
     ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
-    pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
+    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()),
     ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
     ('A pl.', ['A', 'pl.']),
     ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),

@@ -228,11 +228,11 @@ QUOTE_TESTS = [

 DOT_TESTS = [
     ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
-    pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
+    pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()),
     ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
     ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
     ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
-    pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
+    pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()),
     ('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
     ('A pl.', ['A', 'pl.']),
     ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
@ -4,85 +4,87 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||||
def test_id_tokenizer_splits_no_special(id_tokenizer, text):
|
def test_id_tokenizer_splits_no_special(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["Ma'arif"])
|
@pytest.mark.parametrize("text", ["Ma'arif"])
|
||||||
def test_id_tokenizer_splits_no_punct(id_tokenizer, text):
|
def test_id_tokenizer_splits_no_punct(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 1
|
assert len(tokens) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(Ma'arif"])
|
@pytest.mark.parametrize("text", ["(Ma'arif"])
|
||||||
def test_id_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
def test_id_tokenizer_splits_prefix_punct(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["Ma'arif)"])
|
@pytest.mark.parametrize("text", ["Ma'arif)"])
|
||||||
def test_id_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
def test_id_tokenizer_splits_suffix_punct(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(Ma'arif)"])
|
@pytest.mark.parametrize("text", ["(Ma'arif)"])
|
||||||
def test_id_tokenizer_splits_even_wrap(id_tokenizer, text):
|
def test_id_tokenizer_splits_even_wrap(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
|
@pytest.mark.parametrize("text", ["(Ma'arif?)"])
|
||||||
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
|
||||||
def test_id_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
def test_id_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["S.Kom.)"])
|
@pytest.mark.parametrize("text", ["S.Kom.)"])
|
||||||
def test_id_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
def test_id_tokenizer_splits_suffix_interact(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(S.Kom.)"])
|
@pytest.mark.parametrize("text", ["(S.Kom.)"])
|
||||||
def test_id_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
def test_id_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
|
@pytest.mark.parametrize("text", ["(S.Kom.?)"])
|
||||||
def test_id_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
def test_id_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 4
|
assert len(tokens) == 4
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
|
@pytest.mark.parametrize(
|
||||||
|
"text,length", [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)]
|
||||||
|
)
|
||||||
def test_id_tokenizer_splits_hyphens(id_tokenizer, text, length):
|
def test_id_tokenizer_splits_hyphens(id_tokenizer, text, length):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == length
|
assert len(tokens) == length
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
|
||||||
def test_id_tokenizer_splits_numeric_range(id_tokenizer, text):
|
def test_id_tokenizer_splits_numeric_range(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
|
@pytest.mark.parametrize("text", ["ini.Budi", "Halo.Bandung"])
|
||||||
def test_id_tokenizer_splits_period_infix(id_tokenizer, text):
|
def test_id_tokenizer_splits_period_infix(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
|
@pytest.mark.parametrize("text", ["Halo,Bandung", "satu,dua"])
|
||||||
def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
|
def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
@ -91,7 +93,7 @@ def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
|
||||||
assert tokens[2].text == text.split(",")[1]
|
assert tokens[2].text == text.split(",")[1]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
|
@pytest.mark.parametrize("text", ["halo...Bandung", "dia...pergi"])
|
||||||
def test_id_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
def test_id_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
|
||||||
tokens = id_tokenizer(text)
|
tokens = id_tokenizer(text)
|
||||||
assert len(tokens) == 3
|
assert len(tokens) == 3
|
||||||
|
|
|
@@ -5,7 +5,7 @@ import pytest
 from spacy.lang.id.lex_attrs import like_num


-@pytest.mark.parametrize('word', ['sebelas'])
+@pytest.mark.parametrize("word", ["sebelas"])
 def test_id_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())

@@ -4,12 +4,10 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.parametrize('word,lemma', [
-    ('新しく', '新しい'),
-    ('赤く', '赤い'),
-    ('すごく', '凄い'),
-    ('いただきました', '頂く'),
-    ('なった', '成る')])
+@pytest.mark.parametrize(
+    "word,lemma",
+    [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")],
+)
 def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
     test_lemma = ja_tokenizer(word)[0].lemma_
     assert test_lemma == lemma

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import pytest


+# fmt: off
 TOKENIZER_TESTS = [
     ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),

@@ -27,21 +28,22 @@ POS_TESTS = [
     ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
     ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
 ]
+# fmt: on


-@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ja_tokenizer(ja_tokenizer, text, expected_tokens):
     tokens = [token.text for token in ja_tokenizer(text)]
     assert tokens == expected_tokens


-@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
-def test_ja_tokenizer(ja_tokenizer, text, expected_tags):
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     tags = [token.tag_ for token in ja_tokenizer(text)]
     assert tags == expected_tags


-@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
-def test_ja_tokenizer(ja_tokenizer, text, expected_pos):
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
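The hunk above renames the second and third `test_ja_tokenizer` definitions to `test_ja_tokenizer_tags` and `test_ja_tokenizer_pos`; as the commit notes, a later definition with the same name silently rebinds the earlier one, so only the last copy was being collected and any failure in the shadowed tests went unreported. A minimal illustration of the problem (hypothetical test bodies):

def test_example():
    assert 1 + 1 == 2  # shadowed: pytest never collects this version


def test_example():  # noqa: F811 -- redefinition hides the first test
    assert 2 + 2 == 4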
@@ -5,12 +5,18 @@ import pytest

 NB_TOKEN_EXCEPTION_TESTS = [
-    ('Smørsausen brukes bl.a. til fisk', ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']),
-    ('Jeg kommer først kl. 13 pga. diverse forsinkelser', ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse', 'forsinkelser'])
+    (
+        "Smørsausen brukes bl.a. til fisk",
+        ["Smørsausen", "brukes", "bl.a.", "til", "fisk"],
+    ),
+    (
+        "Jeg kommer først kl. 13 pga. diverse forsinkelser",
+        ["Jeg", "kommer", "først", "kl.", "13", "pga.", "diverse", "forsinkelser"],
+    ),
 ]


-@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS)
+@pytest.mark.parametrize("text,expected_tokens", NB_TOKEN_EXCEPTION_TESTS)
 def test_nb_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens):
     tokens = nb_tokenizer(text)
     token_list = [token.text for token in tokens if not token.is_space]

@@ -5,7 +5,7 @@ import pytest
 from spacy.lang.nl.lex_attrs import like_num


-@pytest.mark.parametrize('word', ['elf', 'elfde'])
+@pytest.mark.parametrize("word", ["elf", "elfde"])
 def test_nl_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())

@@ -5,7 +5,7 @@ import pytest
 from spacy.lang.pt.lex_attrs import like_num


-@pytest.mark.parametrize('word', ['onze', 'quadragésimo'])
+@pytest.mark.parametrize("word", ["onze", "quadragésimo"])
 def test_pt_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())

@@ -4,11 +4,15 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.parametrize('string,lemma', [
-    ('câini', 'câine'),
-    ('expedițiilor', 'expediție'),
-    ('pensete', 'pensetă'),
-    ('erau', 'fi')])
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("câini", "câine"),
+        ("expedițiilor", "expediție"),
+        ("pensete", "pensetă"),
+        ("erau", "fi"),
+    ],
+)
 def test_ro_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
     tokens = ro_tokenizer(string)
     assert tokens[0].lemma_ == lemma
@ -5,17 +5,20 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
TEST_CASES = [
|
TEST_CASES = [
|
||||||
('Adresa este str. Principală nr. 5.', ['Adresa', 'este', 'str.', 'Principală', 'nr.', '5', '.']),
|
(
|
||||||
('Teste, etc.', ['Teste', ',', 'etc.']),
|
"Adresa este str. Principală nr. 5.",
|
||||||
('Lista, ș.a.m.d.', ['Lista', ',', 'ș.a.m.d.']),
|
["Adresa", "este", "str.", "Principală", "nr.", "5", "."],
|
||||||
('Și d.p.d.v. al...', ['Și', 'd.p.d.v.', 'al', '...']),
|
),
|
||||||
|
("Teste, etc.", ["Teste", ",", "etc."]),
|
||||||
|
("Lista, ș.a.m.d.", ["Lista", ",", "ș.a.m.d."]),
|
||||||
|
("Și d.p.d.v. al...", ["Și", "d.p.d.v.", "al", "..."]),
|
||||||
# number tests
|
# number tests
|
||||||
('Clasa a 4-a.', ['Clasa', 'a', '4-a', '.']),
|
("Clasa a 4-a.", ["Clasa", "a", "4-a", "."]),
|
||||||
('Al 12-lea ceas.', ['Al', '12-lea', 'ceas', '.'])
|
("Al 12-lea ceas.", ["Al", "12-lea", "ceas", "."]),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,expected_tokens', TEST_CASES)
|
@pytest.mark.parametrize("text,expected_tokens", TEST_CASES)
|
||||||
def test_ro_tokenizer_handles_testcases(ro_tokenizer, text, expected_tokens):
|
def test_ro_tokenizer_handles_testcases(ro_tokenizer, text, expected_tokens):
|
||||||
tokens = ro_tokenizer(text)
|
tokens = ro_tokenizer(text)
|
||||||
token_list = [token.text for token in tokens if not token.is_space]
|
token_list = [token.text for token in tokens if not token.is_space]
|
||||||
|
|
|
@@ -4,10 +4,10 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.parametrize('text,norms', [
-    ("пн.", ["понедельник"]),
-    ("пт.", ["пятница"]),
-    ("дек.", ["декабрь"])])
+@pytest.mark.parametrize(
+    "text,norms",
+    [("пн.", ["понедельник"]), ("пт.", ["пятница"]), ("дек.", ["декабрь"])],
+)
 def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms):
     tokens = ru_tokenizer(text)
     assert len(tokens) == 1
@@ -9,55 +9,71 @@ from ...util import get_doc
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def ru_lemmatizer():
|
def ru_lemmatizer():
|
||||||
pymorphy = pytest.importorskip('pymorphy2')
|
pymorphy = pytest.importorskip("pymorphy2")
|
||||||
return Russian.Defaults.create_lemmatizer()
|
return Russian.Defaults.create_lemmatizer()
|
||||||
|
|
||||||
|
|
||||||
def test_ru_doc_lemmatization(ru_tokenizer):
|
def test_ru_doc_lemmatization(ru_tokenizer):
|
||||||
words = ['мама', 'мыла', 'раму']
|
words = ["мама", "мыла", "раму"]
|
||||||
tags = ['NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing',
|
tags = [
|
||||||
'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act',
|
"NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
|
||||||
'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing']
|
"VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
|
||||||
|
"NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
|
||||||
|
]
|
||||||
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
|
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
|
||||||
lemmas = [token.lemma_ for token in doc]
|
lemmas = [token.lemma_ for token in doc]
|
||||||
assert lemmas == ['мама', 'мыть', 'рама']
|
assert lemmas == ["мама", "мыть", "рама"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,lemmas', [
|
@pytest.mark.parametrize(
|
||||||
('гвоздики', ['гвоздик', 'гвоздика']),
|
"text,lemmas",
|
||||||
('люди', ['человек']),
|
[
|
||||||
('реки', ['река']),
|
("гвоздики", ["гвоздик", "гвоздика"]),
|
||||||
('кольцо', ['кольцо']),
|
("люди", ["человек"]),
|
||||||
('пепперони', ['пепперони'])])
|
("реки", ["река"]),
|
||||||
|
("кольцо", ["кольцо"]),
|
||||||
|
("пепперони", ["пепперони"]),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
|
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
|
||||||
assert sorted(ru_lemmatizer.noun(text)) == lemmas
|
assert sorted(ru_lemmatizer.noun(text)) == lemmas
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models('ru')
|
@pytest.mark.models("ru")
|
||||||
@pytest.mark.parametrize('text,pos,morphology,lemma', [
|
@pytest.mark.parametrize(
|
||||||
('рой', 'NOUN', None, 'рой'),
|
"text,pos,morphology,lemma",
|
||||||
('рой', 'VERB', None, 'рыть'),
|
[
|
||||||
('клей', 'NOUN', None, 'клей'),
|
("рой", "NOUN", None, "рой"),
|
||||||
('клей', 'VERB', None, 'клеить'),
|
("рой", "VERB", None, "рыть"),
|
||||||
('три', 'NUM', None, 'три'),
|
("клей", "NOUN", None, "клей"),
|
||||||
('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
|
("клей", "VERB", None, "клеить"),
|
||||||
('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
|
("три", "NUM", None, "три"),
|
||||||
('кос', 'ADJ', None, 'косой'),
|
("кос", "NOUN", {"Number": "Sing"}, "кос"),
|
||||||
('потом', 'NOUN', None, 'пот'),
|
("кос", "NOUN", {"Number": "Plur"}, "коса"),
|
||||||
('потом', 'ADV', None, 'потом')])
|
("кос", "ADJ", None, "косой"),
|
||||||
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
|
("потом", "NOUN", None, "пот"),
|
||||||
|
("потом", "ADV", None, "потом"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_ru_lemmatizer_works_with_different_pos_homonyms(
|
||||||
|
ru_lemmatizer, text, pos, morphology, lemma
|
||||||
|
):
|
||||||
assert ru_lemmatizer(text, pos, morphology) == [lemma]
|
assert ru_lemmatizer(text, pos, morphology) == [lemma]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,morphology,lemma', [
|
@pytest.mark.parametrize(
|
||||||
('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
|
"text,morphology,lemma",
|
||||||
('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
|
[
|
||||||
('вина', {'Gender': 'Fem'}, 'вина'),
|
("гвоздики", {"Gender": "Fem"}, "гвоздика"),
|
||||||
('вина', {'Gender': 'Neut'}, 'вино')])
|
("гвоздики", {"Gender": "Masc"}, "гвоздик"),
|
||||||
|
("вина", {"Gender": "Fem"}, "вина"),
|
||||||
|
("вина", {"Gender": "Neut"}, "вино"),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
|
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
|
||||||
assert ru_lemmatizer.noun(text, morphology) == [lemma]
|
assert ru_lemmatizer.noun(text, morphology) == [lemma]
|
||||||
|
|
||||||
|
|
||||||
def test_ru_lemmatizer_punct(ru_lemmatizer):
|
def test_ru_lemmatizer_punct(ru_lemmatizer):
|
||||||
assert ru_lemmatizer.punct('«') == ['"']
|
assert ru_lemmatizer.punct("«") == ['"']
|
||||||
assert ru_lemmatizer.punct('»') == ['"']
|
assert ru_lemmatizer.punct("»") == ['"']
|
||||||
|
|
|
@@ -5,7 +5,7 @@ import pytest
 from spacy.lang.ru.lex_attrs import like_num
 
 
-@pytest.mark.parametrize('word', ['одиннадцать'])
+@pytest.mark.parametrize("word", ["одиннадцать"])
 def test_ru_lex_attrs_capitals(word):
     assert like_num(word)
     assert like_num(word.upper())
@@ -4,19 +4,19 @@ from __future__ import unicode_literals
 import pytest
 
 
-PUNCT_OPEN = ['(', '[', '{', '*']
-PUNCT_CLOSE = [')', ']', '}', '*']
-PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
+PUNCT_OPEN = ["(", "[", "{", "*"]
+PUNCT_CLOSE = [")", "]", "}", "*"]
+PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
 
 
-@pytest.mark.parametrize('text', ["(", "((", "<"])
+@pytest.mark.parametrize("text", ["(", "((", "<"])
 def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
     tokens = ru_tokenizer(text)
     assert len(tokens) == len(text)
 
 
-@pytest.mark.parametrize('punct', PUNCT_OPEN)
-@pytest.mark.parametrize('text', ["Привет"])
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("text", ["Привет"])
 def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
     tokens = ru_tokenizer(punct + text)
     assert len(tokens) == 2
@@ -24,8 +24,8 @@ def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
     assert tokens[1].text == text
 
 
-@pytest.mark.parametrize('punct', PUNCT_CLOSE)
-@pytest.mark.parametrize('text', ["Привет"])
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("text", ["Привет"])
 def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
     tokens = ru_tokenizer(text + punct)
     assert len(tokens) == 2
@@ -33,9 +33,9 @@ def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
     assert tokens[1].text == punct
 
 
-@pytest.mark.parametrize('punct', PUNCT_OPEN)
-@pytest.mark.parametrize('punct_add', ["`"])
-@pytest.mark.parametrize('text', ["Привет"])
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("punct_add", ["`"])
+@pytest.mark.parametrize("text", ["Привет"])
 def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
     tokens = ru_tokenizer(punct + punct_add + text)
     assert len(tokens) == 3
@@ -44,9 +44,9 @@ def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add,
     assert tokens[2].text == text
 
 
-@pytest.mark.parametrize('punct', PUNCT_CLOSE)
-@pytest.mark.parametrize('punct_add', ["'"])
-@pytest.mark.parametrize('text', ["Привет"])
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("punct_add", ["'"])
+@pytest.mark.parametrize("text", ["Привет"])
 def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
     tokens = ru_tokenizer(text + punct + punct_add)
     assert len(tokens) == 3
@@ -55,8 +55,8 @@ def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add
     assert tokens[2].text == punct_add
 
 
-@pytest.mark.parametrize('punct', PUNCT_OPEN)
-@pytest.mark.parametrize('text', ["Привет"])
+@pytest.mark.parametrize("punct", PUNCT_OPEN)
+@pytest.mark.parametrize("text", ["Привет"])
 def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
     tokens = ru_tokenizer(punct + punct + punct + text)
     assert len(tokens) == 4
@@ -64,8 +64,8 @@ def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
     assert tokens[3].text == text
 
 
-@pytest.mark.parametrize('punct', PUNCT_CLOSE)
-@pytest.mark.parametrize('text', ["Привет"])
+@pytest.mark.parametrize("punct", PUNCT_CLOSE)
+@pytest.mark.parametrize("text", ["Привет"])
 def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
     tokens = ru_tokenizer(text + punct + punct + punct)
     assert len(tokens) == 4
@@ -73,14 +73,14 @@ def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
     assert tokens[1].text == punct
 
 
-@pytest.mark.parametrize('text', ["'Тест"])
+@pytest.mark.parametrize("text", ["'Тест"])
 def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
     tokens = ru_tokenizer(text)
     assert len(tokens) == 2
     assert tokens[0].text == "'"
 
 
-@pytest.mark.parametrize('text', ["Тест''"])
+@pytest.mark.parametrize("text", ["Тест''"])
 def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
     tokens = ru_tokenizer(text)
     assert len(tokens) == 2
@@ -88,10 +88,11 @@ def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
     assert len(tokens_punct) == 1
 
 
-@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
-@pytest.mark.parametrize('text', ["Тест"])
-def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
-                                              punct_close, text):
+@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
+@pytest.mark.parametrize("text", ["Тест"])
+def test_ru_tokenizer_splits_open_close_punct(
+    ru_tokenizer, punct_open, punct_close, text
+):
     tokens = ru_tokenizer(punct_open + text + punct_close)
     assert len(tokens) == 3
     assert tokens[0].text == punct_open
@@ -99,11 +100,12 @@ def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
     assert tokens[2].text == punct_close
 
 
-@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
-@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
-@pytest.mark.parametrize('text', ["Тест"])
-def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
-                                     punct_open2, punct_close2, text):
+@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
+@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
+@pytest.mark.parametrize("text", ["Тест"])
+def test_ru_tokenizer_two_diff_punct(
+    ru_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
+):
     tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
     assert len(tokens) == 5
     assert tokens[0].text == punct_open2
@@ -113,7 +115,7 @@ def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
     assert tokens[4].text == punct_close2
 
 
-@pytest.mark.parametrize('text', ["Тест."])
+@pytest.mark.parametrize("text", ["Тест."])
 def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
     tokens = ru_tokenizer(text)
     assert tokens[1].text == "."
@@ -5,20 +5,29 @@ import pytest
|
||||||
|
|
||||||
|
|
||||||
SV_TOKEN_EXCEPTION_TESTS = [
|
SV_TOKEN_EXCEPTION_TESTS = [
|
||||||
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
|
(
|
||||||
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
|
"Smörsåsen används bl.a. till fisk",
|
||||||
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
|
["Smörsåsen", "används", "bl.a.", "till", "fisk"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
|
||||||
|
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"Anders I. tycker om ord med i i.",
|
||||||
|
["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
|
@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
|
||||||
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
|
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
|
||||||
tokens = sv_tokenizer(text)
|
tokens = sv_tokenizer(text)
|
||||||
token_list = [token.text for token in tokens if not token.is_space]
|
token_list = [token.text for token in tokens if not token.is_space]
|
||||||
assert expected_tokens == token_list
|
assert expected_tokens == token_list
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"])
|
@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
|
||||||
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
|
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
|
||||||
tokens = sv_tokenizer(text)
|
tokens = sv_tokenizer(text)
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
|
|
|
@@ -6,53 +6,85 @@ from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
|
||||||
from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
|
from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["dog"])
|
@pytest.mark.parametrize("text", ["dog"])
|
||||||
def test_attrs_key(text):
|
def test_attrs_key(text):
|
||||||
assert intify_attrs({"ORTH": text}) == {ORTH: text}
|
assert intify_attrs({"ORTH": text}) == {ORTH: text}
|
||||||
assert intify_attrs({"NORM": text}) == {NORM: text}
|
assert intify_attrs({"NORM": text}) == {NORM: text}
|
||||||
assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10}
|
assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["dog"])
|
@pytest.mark.parametrize("text", ["dog"])
|
||||||
def test_attrs_idempotence(text):
|
def test_attrs_idempotence(text):
|
||||||
int_attrs = intify_attrs({"lemma": text, 'is_alpha': True}, strings_map={text: 10})
|
int_attrs = intify_attrs({"lemma": text, "is_alpha": True}, strings_map={text: 10})
|
||||||
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text', ["dog"])
|
@pytest.mark.parametrize("text", ["dog"])
|
||||||
def test_attrs_do_deprecated(text):
|
def test_attrs_do_deprecated(text):
|
||||||
int_attrs = intify_attrs({"F": text, 'is_alpha': True}, strings_map={text: 10},
|
int_attrs = intify_attrs(
|
||||||
_do_deprecated=True)
|
{"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
|
||||||
|
)
|
||||||
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,match', [(',', True), (' ', False), ('a', False)])
|
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
|
||||||
def test_lex_attrs_is_punct(text, match):
|
def test_lex_attrs_is_punct(text, match):
|
||||||
assert is_punct(text) == match
|
assert is_punct(text) == match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,match', [(',', True), ('£', False), ('♥', False)])
|
@pytest.mark.parametrize("text,match", [(",", True), ("£", False), ("♥", False)])
|
||||||
def test_lex_attrs_is_ascii(text, match):
|
def test_lex_attrs_is_ascii(text, match):
|
||||||
assert is_ascii(text) == match
|
assert is_ascii(text) == match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,match', [('$', True), ('£', True), ('♥', False),
|
@pytest.mark.parametrize(
|
||||||
('€', True), ('¥', True), ('¢', True),
|
"text,match",
|
||||||
('a', False), ('www.google.com', False), ('dog', False)])
|
[
|
||||||
|
("$", True),
|
||||||
|
("£", True),
|
||||||
|
("♥", False),
|
||||||
|
("€", True),
|
||||||
|
("¥", True),
|
||||||
|
("¢", True),
|
||||||
|
("a", False),
|
||||||
|
("www.google.com", False),
|
||||||
|
("dog", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_lex_attrs_is_currency(text, match):
|
def test_lex_attrs_is_currency(text, match):
|
||||||
assert is_currency(text) == match
|
assert is_currency(text) == match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,match', [
|
@pytest.mark.parametrize(
|
||||||
('www.google.com', True), ('google.com', True), ('sydney.com', True),
|
"text,match",
|
||||||
('2girls1cup.org', True), ('http://stupid', True), ('www.hi', True),
|
[
|
||||||
('dog', False), ('1.2', False), ('1.a', False), ('hello.There', False)])
|
("www.google.com", True),
|
||||||
|
("google.com", True),
|
||||||
|
("sydney.com", True),
|
||||||
|
("2girls1cup.org", True),
|
||||||
|
("http://stupid", True),
|
||||||
|
("www.hi", True),
|
||||||
|
("dog", False),
|
||||||
|
("1.2", False),
|
||||||
|
("1.a", False),
|
||||||
|
("hello.There", False),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_lex_attrs_like_url(text, match):
|
def test_lex_attrs_like_url(text, match):
|
||||||
assert like_url(text) == match
|
assert like_url(text) == match
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,shape', [
|
@pytest.mark.parametrize(
|
||||||
('Nasa', 'Xxxx'), ('capitalized', 'xxxx'), ('999999999', 'dddd'),
|
"text,shape",
|
||||||
('C3P0', 'XdXd'), (',', ','), ('\n', '\n'), ('``,-', '``,-')])
|
[
|
||||||
|
("Nasa", "Xxxx"),
|
||||||
|
("capitalized", "xxxx"),
|
||||||
|
("999999999", "dddd"),
|
||||||
|
("C3P0", "XdXd"),
|
||||||
|
(",", ","),
|
||||||
|
("\n", "\n"),
|
||||||
|
("``,-", "``,-"),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_lex_attrs_word_shape(text, shape):
|
def test_lex_attrs_word_shape(text, shape):
|
||||||
assert word_shape(text) == shape
|
assert word_shape(text) == shape
|
||||||
|
|
|
@@ -4,8 +4,9 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.parametrize('text,expected_tokens', [
-    ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])])
+@pytest.mark.parametrize(
+    "text,expected_tokens", [("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"])]
+)
 def test_th_tokenizer(th_tokenizer, text, expected_tokens):
     tokens = [token.text for token in th_tokenizer(text)]
     assert tokens == expected_tokens
@@ -4,14 +4,18 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.parametrize('string,lemma', [
-    ('evlerimizdeki', 'ev'),
-    ('işlerimizi', 'iş'),
-    ('biran', 'biran'),
-    ('bitirmeliyiz', 'bitir'),
-    ('isteklerimizi', 'istek'),
-    ('karşılaştırmamızın', 'karşılaştır'),
-    ('çoğulculuktan', 'çoğulcu')])
+@pytest.mark.parametrize(
+    "string,lemma",
+    [
+        ("evlerimizdeki", "ev"),
+        ("işlerimizi", "iş"),
+        ("biran", "biran"),
+        ("bitirmeliyiz", "bitir"),
+        ("isteklerimizi", "istek"),
+        ("karşılaştırmamızın", "karşılaştır"),
+        ("çoğulculuktan", "çoğulcu"),
+    ],
+)
 def test_tr_lemmatizer_lookup_assigns(tr_tokenizer, string, lemma):
     tokens = tr_tokenizer(string)
     assert tokens[0].lemma_ == lemma
@@ -6,14 +6,16 @@ import pytest
|
||||||
|
|
||||||
INFIX_HYPHEN_TESTS = [
|
INFIX_HYPHEN_TESTS = [
|
||||||
("Явым-төшем күләме.", "Явым-төшем күләме .".split()),
|
("Явым-төшем күләме.", "Явым-төшем күләме .".split()),
|
||||||
("Хатын-кыз киеме.", "Хатын-кыз киеме .".split())
|
("Хатын-кыз киеме.", "Хатын-кыз киеме .".split()),
|
||||||
]
|
]
|
||||||
|
|
||||||
PUNC_INSIDE_WORDS_TESTS = [
|
PUNC_INSIDE_WORDS_TESTS = [
|
||||||
("Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
|
(
|
||||||
"Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ,"
|
"Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
|
||||||
" 783,9 млн. кеше / елда .".split()),
|
"Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ,"
|
||||||
("Ту\"кай", "Ту \" кай".split())
|
" 783,9 млн. кеше / елда .".split(),
|
||||||
|
),
|
||||||
|
('Ту"кай', 'Ту " кай'.split()),
|
||||||
]
|
]
|
||||||
|
|
||||||
MIXED_ORDINAL_NUMS_TESTS = [
|
MIXED_ORDINAL_NUMS_TESTS = [
|
||||||
|
@@ -22,14 +24,14 @@ MIXED_ORDINAL_NUMS_TESTS = [
|
||||||
|
|
||||||
ABBREV_TESTS = [
|
ABBREV_TESTS = [
|
||||||
("«3 елда (б.э.к.) туган", "« 3 елда ( б.э.к. ) туган".split()),
|
("«3 елда (б.э.к.) туган", "« 3 елда ( б.э.к. ) туган".split()),
|
||||||
("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split())
|
("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split()),
|
||||||
]
|
]
|
||||||
|
|
||||||
NAME_ABBREV_TESTS = [
|
NAME_ABBREV_TESTS = [
|
||||||
("Ә.Тукай", "Ә.Тукай".split()),
|
("Ә.Тукай", "Ә.Тукай".split()),
|
||||||
("Ә.тукай", "Ә.тукай".split()),
|
("Ә.тукай", "Ә.тукай".split()),
|
||||||
("ә.Тукай", "ә . Тукай".split()),
|
("ә.Тукай", "ә . Тукай".split()),
|
||||||
("Миләүшә.", "Миләүшә .".split())
|
("Миләүшә.", "Миләүшә .".split()),
|
||||||
]
|
]
|
||||||
|
|
||||||
TYPOS_IN_PUNC_TESTS = [
|
TYPOS_IN_PUNC_TESTS = [
|
||||||
|
@@ -37,30 +39,39 @@ TYPOS_IN_PUNC_TESTS = [
|
||||||
("«3 елда,туган", "« 3 елда , туган".split()),
|
("«3 елда,туган", "« 3 елда , туган".split()),
|
||||||
("«3 елда,туган.", "« 3 елда , туган .".split()),
|
("«3 елда,туган.", "« 3 елда , туган .".split()),
|
||||||
("Ул эшли(кайчан?)", "Ул эшли ( кайчан ? )".split()),
|
("Ул эшли(кайчан?)", "Ул эшли ( кайчан ? )".split()),
|
||||||
("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()) # "?)" => "?)" or "? )"
|
("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()), # "?)" => "?)" or "? )"
|
||||||
]
|
]
|
||||||
|
|
||||||
LONG_TEXTS_TESTS = [
|
LONG_TEXTS_TESTS = [
|
||||||
("Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
(
|
||||||
"якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз"
|
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
||||||
"меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең"
|
"якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз"
|
||||||
"салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын"
|
"меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең"
|
||||||
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.",
|
"салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын"
|
||||||
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.",
|
||||||
"якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз"
|
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
|
||||||
"меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең"
|
"якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз"
|
||||||
"салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын"
|
"меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең"
|
||||||
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split()
|
"салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын"
|
||||||
)
|
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split(),
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
TESTCASES = (INFIX_HYPHEN_TESTS + PUNC_INSIDE_WORDS_TESTS +
|
TESTCASES = (
|
||||||
MIXED_ORDINAL_NUMS_TESTS + ABBREV_TESTS + NAME_ABBREV_TESTS +
|
INFIX_HYPHEN_TESTS
|
||||||
LONG_TEXTS_TESTS + TYPOS_IN_PUNC_TESTS)
|
+ PUNC_INSIDE_WORDS_TESTS
|
||||||
|
+ MIXED_ORDINAL_NUMS_TESTS
|
||||||
|
+ ABBREV_TESTS
|
||||||
|
+ NAME_ABBREV_TESTS
|
||||||
|
+ LONG_TEXTS_TESTS
|
||||||
|
+ TYPOS_IN_PUNC_TESTS
|
||||||
|
)
|
||||||
|
|
||||||
NORM_TESTCASES = [
|
NORM_TESTCASES = [
|
||||||
("тукымадан һ.б.ш. тегелгән.",
|
(
|
||||||
["тукымадан", "һәм башка шундыйлар", "тегелгән", "."])
|
"тукымадан һ.б.ш. тегелгән.",
|
||||||
|
["тукымадан", "һәм башка шундыйлар", "тегелгән", "."],
|
||||||
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@@ -70,7 +81,7 @@ def test_tt_tokenizer_handles_testcases(tt_tokenizer, text, expected_tokens):
     assert expected_tokens == tokens
 
 
-@pytest.mark.parametrize('text,norms', NORM_TESTCASES)
+@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
 def test_tt_tokenizer_handles_norm_exceptions(tt_tokenizer, text, norms):
     tokens = tt_tokenizer(text)
     assert [token.norm_ for token in tokens] == norms
@@ -13,9 +13,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
     assert len(tokens) == 77
 
 
-@pytest.mark.parametrize('text,length', [
-    ("تحریر باسط حبیب", 3),
-    ("میرا پاکستان", 2)])
+@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
 def test_ur_tokenizer_handles_cnts(ur_tokenizer, text, length):
     tokens = ur_tokenizer(text)
     assert len(tokens) == length
@@ -10,9 +10,11 @@ from ..util import get_doc
 
 @pytest.fixture
 def matcher(en_vocab):
-    rules = {'JS': [[{'ORTH': 'JavaScript'}]],
-             'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
-             'Java': [[{'LOWER': 'java'}]]}
+    rules = {
+        "JS": [[{"ORTH": "JavaScript"}]],
+        "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
+        "Java": [[{"LOWER": "java"}]],
+    }
     matcher = Matcher(en_vocab)
     for key, patterns in rules.items():
         matcher.add(key, None, *patterns)
@@ -21,44 +23,44 @@ def matcher(en_vocab):
|
||||||
|
|
||||||
def test_matcher_from_api_docs(en_vocab):
|
def test_matcher_from_api_docs(en_vocab):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
pattern = [{'ORTH': 'test'}]
|
pattern = [{"ORTH": "test"}]
|
||||||
assert len(matcher) == 0
|
assert len(matcher) == 0
|
||||||
matcher.add('Rule', None, pattern)
|
matcher.add("Rule", None, pattern)
|
||||||
assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
matcher.remove('Rule')
|
matcher.remove("Rule")
|
||||||
assert 'Rule' not in matcher
|
assert "Rule" not in matcher
|
||||||
matcher.add('Rule', None, pattern)
|
matcher.add("Rule", None, pattern)
|
||||||
assert 'Rule' in matcher
|
assert "Rule" in matcher
|
||||||
on_match, patterns = matcher.get('Rule')
|
on_match, patterns = matcher.get("Rule")
|
||||||
assert len(patterns[0])
|
assert len(patterns[0])
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_from_usage_docs(en_vocab):
|
def test_matcher_from_usage_docs(en_vocab):
|
||||||
text = "Wow 😀 This is really cool! 😂 😂"
|
text = "Wow 😀 This is really cool! 😂 😂"
|
||||||
doc = Doc(en_vocab, words=text.split(' '))
|
doc = Doc(en_vocab, words=text.split(" "))
|
||||||
pos_emoji = ['😀', '😃', '😂', '🤣', '😊', '😍']
|
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
|
||||||
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
|
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
|
||||||
|
|
||||||
def label_sentiment(matcher, doc, i, matches):
|
def label_sentiment(matcher, doc, i, matches):
|
||||||
match_id, start, end = matches[i]
|
match_id, start, end = matches[i]
|
||||||
if doc.vocab.strings[match_id] == 'HAPPY':
|
if doc.vocab.strings[match_id] == "HAPPY":
|
||||||
doc.sentiment += 0.1
|
doc.sentiment += 0.1
|
||||||
span = doc[start : end]
|
span = doc[start:end]
|
||||||
token = span.merge()
|
token = span.merge()
|
||||||
token.vocab[token.text].norm_ = 'happy emoji'
|
token.vocab[token.text].norm_ = "happy emoji"
|
||||||
|
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add('HAPPY', label_sentiment, *pos_patterns)
|
matcher.add("HAPPY", label_sentiment, *pos_patterns)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert doc.sentiment != 0
|
assert doc.sentiment != 0
|
||||||
assert doc[1].norm_ == 'happy emoji'
|
assert doc[1].norm_ == "happy emoji"
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_len_contains(matcher):
|
def test_matcher_len_contains(matcher):
|
||||||
assert len(matcher) == 3
|
assert len(matcher) == 3
|
||||||
matcher.add('TEST', None, [{'ORTH': 'test'}])
|
matcher.add("TEST", None, [{"ORTH": "test"}])
|
||||||
assert 'TEST' in matcher
|
assert "TEST" in matcher
|
||||||
assert 'TEST2' not in matcher
|
assert "TEST2" not in matcher
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_no_match(matcher):
|
def test_matcher_no_match(matcher):
|
||||||
|
@@ -68,38 +70,40 @@ def test_matcher_no_match(matcher):
|
||||||
|
|
||||||
def test_matcher_match_start(matcher):
|
def test_matcher_match_start(matcher):
|
||||||
doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
|
doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
|
||||||
assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
|
assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)]
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_match_end(matcher):
|
def test_matcher_match_end(matcher):
|
||||||
words = ["I", "like", "java"]
|
words = ["I", "like", "java"]
|
||||||
doc = Doc(matcher.vocab, words=words)
|
doc = Doc(matcher.vocab, words=words)
|
||||||
assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
|
assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)]
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_match_middle(matcher):
|
def test_matcher_match_middle(matcher):
|
||||||
words = ["I", "like", "Google", "Now", "best"]
|
words = ["I", "like", "Google", "Now", "best"]
|
||||||
doc = Doc(matcher.vocab, words=words)
|
doc = Doc(matcher.vocab, words=words)
|
||||||
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
|
assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_match_multi(matcher):
|
def test_matcher_match_multi(matcher):
|
||||||
words = ["I", "like", "Google", "Now", "and", "java", "best"]
|
words = ["I", "like", "Google", "Now", "and", "java", "best"]
|
||||||
doc = Doc(matcher.vocab, words=words)
|
doc = Doc(matcher.vocab, words=words)
|
||||||
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
|
assert matcher(doc) == [
|
||||||
(doc.vocab.strings['Java'], 5, 6)]
|
(doc.vocab.strings["GoogleNow"], 2, 4),
|
||||||
|
(doc.vocab.strings["Java"], 5, 6),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_empty_dict(en_vocab):
|
def test_matcher_empty_dict(en_vocab):
|
||||||
"""Test matcher allows empty token specs, meaning match on any token."""
|
"""Test matcher allows empty token specs, meaning match on any token."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
doc = Doc(matcher.vocab, words=["a", "b", "c"])
|
||||||
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
|
matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
assert matches[0][1:] == (0, 3)
|
assert matches[0][1:] == (0, 3)
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
|
matcher.add("A.", None, [{"ORTH": "a"}, {}])
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert matches[0][1:] == (0, 2)
|
assert matches[0][1:] == (0, 2)
|
||||||
|
|
||||||
|
@@ -107,8 +111,8 @@ def test_matcher_empty_dict(en_vocab):
 def test_matcher_operator_shadow(en_vocab):
     matcher = Matcher(en_vocab)
     doc = Doc(matcher.vocab, words=["a", "b", "c"])
-    pattern = [{'ORTH': 'a'}, {"IS_ALPHA": True, "OP": "+"}, {'ORTH': 'c'}]
-    matcher.add('A.C', None, pattern)
+    pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
+    matcher.add("A.C", None, pattern)
     matches = matcher(doc)
     assert len(matches) == 1
     assert matches[0][1:] == (0, 3)
@@ -117,43 +121,48 @@ def test_matcher_operator_shadow(en_vocab):
|
||||||
def test_matcher_match_zero(matcher):
|
def test_matcher_match_zero(matcher):
|
||||||
words1 = 'He said , " some words " ...'.split()
|
words1 = 'He said , " some words " ...'.split()
|
||||||
words2 = 'He said , " some three words " ...'.split()
|
words2 = 'He said , " some three words " ...'.split()
|
||||||
pattern1 = [{'ORTH': '"'},
|
pattern1 = [
|
||||||
{'OP': '!', 'IS_PUNCT': True},
|
{"ORTH": '"'},
|
||||||
{'OP': '!', 'IS_PUNCT': True},
|
{"OP": "!", "IS_PUNCT": True},
|
||||||
{'ORTH': '"'}]
|
{"OP": "!", "IS_PUNCT": True},
|
||||||
pattern2 = [{'ORTH': '"'},
|
{"ORTH": '"'},
|
||||||
{'IS_PUNCT': True},
|
]
|
||||||
{'IS_PUNCT': True},
|
pattern2 = [
|
||||||
{'IS_PUNCT': True},
|
{"ORTH": '"'},
|
||||||
{'ORTH': '"'}]
|
{"IS_PUNCT": True},
|
||||||
matcher.add('Quote', None, pattern1)
|
{"IS_PUNCT": True},
|
||||||
|
{"IS_PUNCT": True},
|
||||||
|
{"ORTH": '"'},
|
||||||
|
]
|
||||||
|
matcher.add("Quote", None, pattern1)
|
||||||
doc = Doc(matcher.vocab, words=words1)
|
doc = Doc(matcher.vocab, words=words1)
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
doc = Doc(matcher.vocab, words=words2)
|
doc = Doc(matcher.vocab, words=words2)
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
matcher.add('Quote', None, pattern2)
|
matcher.add("Quote", None, pattern2)
|
||||||
assert len(matcher(doc)) == 0
|
assert len(matcher(doc)) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_match_zero_plus(matcher):
|
def test_matcher_match_zero_plus(matcher):
|
||||||
words = 'He said , " some words " ...'.split()
|
words = 'He said , " some words " ...'.split()
|
||||||
pattern = [{'ORTH': '"'},
|
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
|
||||||
{'OP': '*', 'IS_PUNCT': False},
|
|
||||||
{'ORTH': '"'}]
|
|
||||||
matcher = Matcher(matcher.vocab)
|
matcher = Matcher(matcher.vocab)
|
||||||
matcher.add('Quote', None, pattern)
|
matcher.add("Quote", None, pattern)
|
||||||
doc = Doc(matcher.vocab, words=words)
|
doc = Doc(matcher.vocab, words=words)
|
||||||
assert len(matcher(doc)) == 1
|
assert len(matcher(doc)) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_matcher_match_one_plus(matcher):
|
def test_matcher_match_one_plus(matcher):
|
||||||
control = Matcher(matcher.vocab)
|
control = Matcher(matcher.vocab)
|
||||||
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
|
control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}])
|
||||||
doc = Doc(control.vocab, words=['Philippe', 'Philippe'])
|
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
|
||||||
m = control(doc)
|
m = control(doc)
|
||||||
assert len(m) == 2
|
assert len(m) == 2
|
||||||
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
|
matcher.add(
|
||||||
{'ORTH': 'Philippe', 'OP': '+'}])
|
"KleenePhilippe",
|
||||||
|
None,
|
||||||
|
[{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}],
|
||||||
|
)
|
||||||
m = matcher(doc)
|
m = matcher(doc)
|
||||||
assert len(m) == 1
|
assert len(m) == 1
|
||||||
|
|
||||||
|
@@ -161,54 +170,70 @@ def test_matcher_match_one_plus(matcher):
|
||||||
def test_matcher_any_token_operator(en_vocab):
|
def test_matcher_any_token_operator(en_vocab):
|
||||||
"""Test that patterns with "any token" {} work with operators."""
|
"""Test that patterns with "any token" {} work with operators."""
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
matcher.add('TEST', None, [{'ORTH': 'test'}, {'OP': '*'}])
|
matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
|
||||||
doc = Doc(en_vocab, words=['test', 'hello', 'world'])
|
doc = Doc(en_vocab, words=["test", "hello", "world"])
|
||||||
matches = [doc[start:end].text for _, start, end in matcher(doc)]
|
matches = [doc[start:end].text for _, start, end in matcher(doc)]
|
||||||
assert len(matches) == 3
|
assert len(matches) == 3
|
||||||
assert matches[0] == 'test'
|
assert matches[0] == "test"
|
||||||
assert matches[1] == 'test hello'
|
assert matches[1] == "test hello"
|
||||||
assert matches[2] == 'test hello world'
|
assert matches[2] == "test hello world"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def text():
|
def text():
|
||||||
return u"The quick brown fox jumped over the lazy fox"
|
return "The quick brown fox jumped over the lazy fox"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def heads():
|
def heads():
|
||||||
return [3,2,1,1,0,-1,2,1,-3]
|
return [3, 2, 1, 1, 0, -1, 2, 1, -3]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def deps():
|
def deps():
|
||||||
return ['det', 'amod', 'amod', 'nsubj', 'prep', 'pobj', 'det', 'amod']
|
return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def dependency_tree_matcher(en_vocab):
|
def dependency_tree_matcher(en_vocab):
|
||||||
is_brown_yellow = lambda text: bool(re.compile(r'brown|yellow|over').match(text))
|
def is_brown_yellow(text):
|
||||||
|
return bool(re.compile(r"brown|yellow|over").match(text))
|
||||||
|
|
||||||
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
|
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
|
||||||
pattern1 = [
|
pattern1 = [
|
||||||
{'SPEC': {'NODE_NAME': 'fox'}, 'PATTERN': {'ORTH': 'fox'}},
|
{"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
|
||||||
{'SPEC': {'NODE_NAME': 'q', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'},'PATTERN': {'LOWER': u'quick'}},
|
{
|
||||||
{'SPEC': {'NODE_NAME': 'r', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}}
|
"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||||
|
"PATTERN": {"LOWER": "quick"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||||
|
"PATTERN": {IS_BROWN_YELLOW: True},
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
pattern2 = [
|
pattern2 = [
|
||||||
{'SPEC': {'NODE_NAME': 'jumped'}, 'PATTERN': {'ORTH': 'jumped'}},
|
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
|
||||||
{'SPEC': {'NODE_NAME': 'fox', 'NBOR_RELOP': '>', 'NBOR_NAME': 'jumped'},'PATTERN': {'LOWER': u'fox'}},
|
{
|
||||||
{'SPEC': {'NODE_NAME': 'over', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}}
|
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
|
||||||
|
"PATTERN": {"LOWER": "fox"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"SPEC": {"NODE_NAME": "over", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
|
||||||
|
"PATTERN": {IS_BROWN_YELLOW: True},
|
||||||
|
},
|
||||||
]
|
]
|
||||||
matcher = DependencyTreeMatcher(en_vocab)
|
matcher = DependencyTreeMatcher(en_vocab)
|
||||||
matcher.add('pattern1', None, pattern1)
|
matcher.add("pattern1", None, pattern1)
|
||||||
matcher.add('pattern2', None, pattern2)
|
matcher.add("pattern2", None, pattern2)
|
||||||
return matcher
|
return matcher
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_dependency_tree_matcher_compile(dependency_tree_matcher):
|
def test_dependency_tree_matcher_compile(dependency_tree_matcher):
|
||||||
assert len(dependency_tree_matcher) == 2
|
assert len(dependency_tree_matcher) == 2
|
||||||
|
|
||||||
def test_dependency_tree_matcher(dependency_tree_matcher,text,heads,deps):
|
|
||||||
doc = get_doc(dependency_tree_matcher.vocab,text.split(),heads=heads,deps=deps)
|
def test_dependency_tree_matcher(dependency_tree_matcher, text, heads, deps):
|
||||||
|
doc = get_doc(dependency_tree_matcher.vocab, text.split(), heads=heads, deps=deps)
|
||||||
matches = dependency_tree_matcher(doc)
|
matches = dependency_tree_matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
|
|
||||||
|
|
|
@@ -7,17 +7,25 @@ from spacy.matcher import Matcher
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
pattern1 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}]
|
pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]
|
||||||
pattern2 = [{'ORTH':'A', 'OP':'*'}, {'ORTH':'A', 'OP':'1'}]
|
pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}]
|
||||||
pattern3 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'1'}]
|
pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}]
|
||||||
pattern4 = [{'ORTH':'B', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]
|
pattern4 = [
|
||||||
pattern5 = [{'ORTH':'B', 'OP':'*'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]
|
{"ORTH": "B", "OP": "1"},
|
||||||
|
{"ORTH": "A", "OP": "*"},
|
||||||
|
{"ORTH": "B", "OP": "1"},
|
||||||
|
]
|
||||||
|
pattern5 = [
|
||||||
|
{"ORTH": "B", "OP": "*"},
|
||||||
|
{"ORTH": "A", "OP": "*"},
|
||||||
|
{"ORTH": "B", "OP": "1"},
|
||||||
|
]
|
||||||
|
|
||||||
re_pattern1 = 'AA*'
|
re_pattern1 = "AA*"
|
||||||
re_pattern2 = 'A*A'
|
re_pattern2 = "A*A"
|
||||||
re_pattern3 = 'AA'
|
re_pattern3 = "AA"
|
||||||
re_pattern4 = 'BA*B'
|
re_pattern4 = "BA*B"
|
||||||
re_pattern5 = 'B*A*B'
|
re_pattern5 = "B*A*B"
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@@ -27,17 +35,20 @@ def text():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(en_tokenizer, text):
|
def doc(en_tokenizer, text):
|
||||||
doc = en_tokenizer(' '.join(text))
|
doc = en_tokenizer(" ".join(text))
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.parametrize(
|
||||||
@pytest.mark.parametrize('pattern,re_pattern', [
|
"pattern,re_pattern",
|
||||||
(pattern1, re_pattern1),
|
[
|
||||||
(pattern2, re_pattern2),
|
pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()),
|
||||||
(pattern3, re_pattern3),
|
pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()),
|
||||||
(pattern4, re_pattern4),
|
pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()),
|
||||||
(pattern5, re_pattern5)])
|
(pattern4, re_pattern4),
|
||||||
|
pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_greedy_matching(doc, text, pattern, re_pattern):
|
def test_greedy_matching(doc, text, pattern, re_pattern):
|
||||||
"""Test that the greedy matching behavior of the * op is consistant with
|
"""Test that the greedy matching behavior of the * op is consistant with
|
||||||
other re implementations."""
|
other re implementations."""
|
||||||
|
@@ -50,12 +61,16 @@ def test_greedy_matching(doc, text, pattern, re_pattern):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('pattern,re_pattern', [
|
@pytest.mark.parametrize(
|
||||||
(pattern1, re_pattern1),
|
"pattern,re_pattern",
|
||||||
(pattern2, re_pattern2),
|
[
|
||||||
(pattern3, re_pattern3),
|
(pattern1, re_pattern1),
|
||||||
(pattern4, re_pattern4),
|
(pattern2, re_pattern2),
|
||||||
(pattern5, re_pattern5)])
|
(pattern3, re_pattern3),
|
||||||
|
(pattern4, re_pattern4),
|
||||||
|
(pattern5, re_pattern5),
|
||||||
|
],
|
||||||
|
)
|
||||||
def test_match_consuming(doc, text, pattern, re_pattern):
|
def test_match_consuming(doc, text, pattern, re_pattern):
|
||||||
"""Test that matcher.__call__ consumes tokens on a match similar to
|
"""Test that matcher.__call__ consumes tokens on a match similar to
|
||||||
re.findall."""
|
re.findall."""
|
||||||
|
@@ -68,33 +83,33 @@ def test_match_consuming(doc, text, pattern, re_pattern):
|
||||||
|
|
||||||
def test_operator_combos(en_vocab):
|
def test_operator_combos(en_vocab):
|
||||||
cases = [
|
cases = [
|
||||||
('aaab', 'a a a b', True),
|
("aaab", "a a a b", True),
|
||||||
('aaab', 'a+ b', True),
|
("aaab", "a+ b", True),
|
||||||
('aaab', 'a+ a+ b', True),
|
("aaab", "a+ a+ b", True),
|
||||||
('aaab', 'a+ a+ a b', True),
|
("aaab", "a+ a+ a b", True),
|
||||||
('aaab', 'a+ a+ a+ b', True),
|
("aaab", "a+ a+ a+ b", True),
|
||||||
('aaab', 'a+ a a b', True),
|
("aaab", "a+ a a b", True),
|
||||||
('aaab', 'a+ a a', True),
|
("aaab", "a+ a a", True),
|
||||||
('aaab', 'a+', True),
|
("aaab", "a+", True),
|
||||||
('aaa', 'a+ b', False),
|
("aaa", "a+ b", False),
|
||||||
('aaa', 'a+ a+ b', False),
|
("aaa", "a+ a+ b", False),
|
||||||
('aaa', 'a+ a+ a+ b', False),
|
("aaa", "a+ a+ a+ b", False),
|
||||||
('aaa', 'a+ a b', False),
|
("aaa", "a+ a b", False),
|
||||||
('aaa', 'a+ a a b', False),
|
("aaa", "a+ a a b", False),
|
||||||
('aaab', 'a+ a a', True),
|
("aaab", "a+ a a", True),
|
||||||
('aaab', 'a+', True),
|
("aaab", "a+", True),
|
||||||
('aaab', 'a+ a b', True)
|
("aaab", "a+ a b", True),
|
||||||
]
|
]
|
||||||
for string, pattern_str, result in cases:
|
for string, pattern_str, result in cases:
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
doc = Doc(matcher.vocab, words=list(string))
|
doc = Doc(matcher.vocab, words=list(string))
|
||||||
pattern = []
|
pattern = []
|
||||||
for part in pattern_str.split():
|
for part in pattern_str.split():
|
||||||
if part.endswith('+'):
|
if part.endswith("+"):
|
||||||
pattern.append({'ORTH': part[0], 'OP': '+'})
|
pattern.append({"ORTH": part[0], "OP": "+"})
|
||||||
else:
|
else:
|
||||||
pattern.append({'ORTH': part})
|
pattern.append({"ORTH": part})
|
||||||
matcher.add('PATTERN', None, pattern)
|
matcher.add("PATTERN", None, pattern)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
if result:
|
if result:
|
||||||
assert matches, (string, pattern_str)
|
assert matches, (string, pattern_str)
|
||||||
|
@@ -105,12 +120,12 @@ def test_operator_combos(en_vocab):
 def test_matcher_end_zero_plus(en_vocab):
     """Test matcher works when patterns end with * operator. (issue 1450)"""
     matcher = Matcher(en_vocab)
-    pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
-    matcher.add('TSTEND', None, pattern)
+    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
+    matcher.add("TSTEND", None, pattern)
     nlp = lambda string: Doc(matcher.vocab, words=string.split())
-    assert len(matcher(nlp('a'))) == 1
-    assert len(matcher(nlp('a b'))) == 2
-    assert len(matcher(nlp('a c'))) == 1
-    assert len(matcher(nlp('a b c'))) == 2
-    assert len(matcher(nlp('a b b c'))) == 3
-    assert len(matcher(nlp('a b b'))) == 3
+    assert len(matcher(nlp("a"))) == 1
+    assert len(matcher(nlp("a b"))) == 2
+    assert len(matcher(nlp("a c"))) == 1
+    assert len(matcher(nlp("a b c"))) == 2
+    assert len(matcher(nlp("a b b c"))) == 3
+    assert len(matcher(nlp("a b b"))) == 3
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import pytest
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc
 
@@ -11,7 +10,7 @@ from ..util import get_doc
 def test_matcher_phrase_matcher(en_vocab):
     doc = Doc(en_vocab, words=["Google", "Now"])
     matcher = PhraseMatcher(en_vocab)
-    matcher.add('COMPANY', None, doc)
+    matcher.add("COMPANY", None, doc)
     doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
     assert len(matcher(doc)) == 1
 
@@ -19,63 +18,63 @@ def test_matcher_phrase_matcher(en_vocab):
|
||||||
def test_phrase_matcher_length(en_vocab):
|
def test_phrase_matcher_length(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
assert len(matcher) == 0
|
assert len(matcher) == 0
|
||||||
matcher.add('TEST', None, Doc(en_vocab, words=['test']))
|
matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
|
||||||
assert len(matcher) == 1
|
assert len(matcher) == 1
|
||||||
matcher.add('TEST2', None, Doc(en_vocab, words=['test2']))
|
matcher.add("TEST2", None, Doc(en_vocab, words=["test2"]))
|
||||||
assert len(matcher) == 2
|
assert len(matcher) == 2
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_contains(en_vocab):
|
def test_phrase_matcher_contains(en_vocab):
|
||||||
matcher = PhraseMatcher(en_vocab)
|
matcher = PhraseMatcher(en_vocab)
|
||||||
matcher.add('TEST', None, Doc(en_vocab, words=['test']))
|
matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
|
||||||
assert 'TEST' in matcher
|
assert "TEST" in matcher
|
||||||
assert 'TEST2' not in matcher
|
assert "TEST2" not in matcher
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_string_attrs(en_vocab):
|
def test_phrase_matcher_string_attrs(en_vocab):
|
||||||
words1 = ['I', 'like', 'cats']
|
words1 = ["I", "like", "cats"]
|
||||||
pos1 = ['PRON', 'VERB', 'NOUN']
|
pos1 = ["PRON", "VERB", "NOUN"]
|
||||||
words2 = ['Yes', ',', 'you', 'hate', 'dogs', 'very', 'much']
|
words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
|
||||||
pos2 = ['INTJ', 'PUNCT', 'PRON', 'VERB', 'NOUN', 'ADV', 'ADV']
|
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
|
||||||
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
||||||
matcher = PhraseMatcher(en_vocab, attr='POS')
|
matcher = PhraseMatcher(en_vocab, attr="POS")
|
||||||
matcher.add('TEST', None, pattern)
|
matcher.add("TEST", None, pattern)
|
||||||
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 1
|
assert len(matches) == 1
|
||||||
match_id, start, end = matches[0]
|
match_id, start, end = matches[0]
|
||||||
assert match_id == en_vocab.strings['TEST']
|
assert match_id == en_vocab.strings["TEST"]
|
||||||
assert start == 2
|
assert start == 2
|
||||||
assert end == 5
|
assert end == 5
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_string_attrs_negative(en_vocab):
|
def test_phrase_matcher_string_attrs_negative(en_vocab):
|
||||||
"""Test that token with the control codes as ORTH are *not* matched."""
|
"""Test that token with the control codes as ORTH are *not* matched."""
|
||||||
words1 = ['I', 'like', 'cats']
|
words1 = ["I", "like", "cats"]
|
||||||
pos1 = ['PRON', 'VERB', 'NOUN']
|
pos1 = ["PRON", "VERB", "NOUN"]
|
||||||
words2 = ['matcher:POS-PRON', 'matcher:POS-VERB', 'matcher:POS-NOUN']
|
words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
|
||||||
pos2 = ['X', 'X', 'X']
|
pos2 = ["X", "X", "X"]
|
||||||
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
pattern = get_doc(en_vocab, words=words1, pos=pos1)
|
||||||
matcher = PhraseMatcher(en_vocab, attr='POS')
|
matcher = PhraseMatcher(en_vocab, attr="POS")
|
||||||
matcher.add('TEST', None, pattern)
|
matcher.add("TEST", None, pattern)
|
||||||
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
doc = get_doc(en_vocab, words=words2, pos=pos2)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 0
|
assert len(matches) == 0
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_matcher_bool_attrs(en_vocab):
|
def test_phrase_matcher_bool_attrs(en_vocab):
|
||||||
words1 = ['Hello', 'world', '!']
|
words1 = ["Hello", "world", "!"]
|
||||||
words2 = ['No', 'problem', ',', 'he', 'said', '.']
|
words2 = ["No", "problem", ",", "he", "said", "."]
|
||||||
pattern = Doc(en_vocab, words=words1)
|
pattern = Doc(en_vocab, words=words1)
|
||||||
matcher = PhraseMatcher(en_vocab, attr='IS_PUNCT')
|
matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
|
||||||
matcher.add('TEST', None, pattern)
|
matcher.add("TEST", None, pattern)
|
||||||
doc = Doc(en_vocab, words=words2)
|
doc = Doc(en_vocab, words=words2)
|
||||||
matches = matcher(doc)
|
matches = matcher(doc)
|
||||||
assert len(matches) == 2
|
assert len(matches) == 2
|
||||||
match_id1, start1, end1 = matches[0]
|
match_id1, start1, end1 = matches[0]
|
||||||
match_id2, start2, end2 = matches[1]
|
match_id2, start2, end2 = matches[1]
|
||||||
assert match_id1 == en_vocab.strings['TEST']
|
assert match_id1 == en_vocab.strings["TEST"]
|
||||||
assert match_id2 == en_vocab.strings['TEST']
|
assert match_id2 == en_vocab.strings["TEST"]
|
||||||
assert start1 == 0
|
assert start1 == 0
|
||||||
assert end1 == 3
|
assert end1 == 3
|
||||||
assert start2 == 3
|
assert start2 == 3
|
||||||
|
|
|
@@ -2,7 +2,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import numpy.random
|
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
from thinc.neural.ops import NumpyOps
|
from thinc.neural.ops import NumpyOps
|
||||||
from spacy.attrs import NORM
|
from spacy.attrs import NORM
|
||||||
|
@@ -20,18 +19,17 @@ def vocab():
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = DependencyParser(vocab)
|
parser = DependencyParser(vocab)
|
||||||
parser.cfg['token_vector_width'] = 8
|
parser.cfg["token_vector_width"] = 8
|
||||||
parser.cfg['hidden_width'] = 30
|
parser.cfg["hidden_width"] = 30
|
||||||
parser.cfg['hist_size'] = 0
|
parser.cfg["hist_size"] = 0
|
||||||
parser.add_label('left')
|
parser.add_label("left")
|
||||||
parser.begin_training([], **parser.cfg)
|
parser.begin_training([], **parser.cfg)
|
||||||
sgd = Adam(NumpyOps(), 0.001)
|
sgd = Adam(NumpyOps(), 0.001)
|
||||||
|
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
losses = {}
|
losses = {}
|
||||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(vocab, words=["a", "b", "c", "d"])
|
||||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
|
||||||
deps=['left', 'ROOT', 'left', 'ROOT'])
|
|
||||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
@@ -44,29 +42,30 @@ def test_init_parser(parser):
|
||||||
# TODO: This now seems to be implicated in segfaults. Not sure what's up!
|
# TODO: This now seems to be implicated in segfaults. Not sure what's up!
|
||||||
@pytest.mark.skip
|
@pytest.mark.skip
|
||||||
def test_add_label(parser):
|
def test_add_label(parser):
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert doc[0].head.i == 1
|
assert doc[0].head.i == 1
|
||||||
assert doc[0].dep_ == 'left'
|
assert doc[0].dep_ == "left"
|
||||||
assert doc[1].head.i == 1
|
assert doc[1].head.i == 1
|
||||||
assert doc[2].head.i == 3
|
assert doc[2].head.i == 3
|
||||||
assert doc[2].head.i == 3
|
assert doc[2].head.i == 3
|
||||||
parser.add_label('right')
|
parser.add_label("right")
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert doc[0].head.i == 1
|
assert doc[0].head.i == 1
|
||||||
assert doc[0].dep_ == 'left'
|
assert doc[0].dep_ == "left"
|
||||||
assert doc[1].head.i == 1
|
assert doc[1].head.i == 1
|
||||||
assert doc[2].head.i == 3
|
assert doc[2].head.i == 3
|
||||||
assert doc[2].head.i == 3
|
assert doc[2].head.i == 3
|
||||||
sgd = Adam(NumpyOps(), 0.001)
|
sgd = Adam(NumpyOps(), 0.001)
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
losses = {}
|
losses = {}
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
gold = GoldParse(
|
||||||
deps=['right', 'ROOT', 'left', 'ROOT'])
|
doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
|
||||||
|
)
|
||||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert doc[0].dep_ == 'right'
|
assert doc[0].dep_ == "right"
|
||||||
assert doc[2].dep_ == 'left'
|
assert doc[2].dep_ == "left"
|
||||||
|
|
|
@@ -31,16 +31,19 @@ def get_sequence_costs(M, words, heads, deps, transitions):
|
||||||
def vocab():
|
def vocab():
|
||||||
return Vocab()
|
return Vocab()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def arc_eager(vocab):
|
def arc_eager(vocab):
|
||||||
moves = ArcEager(vocab.strings, ArcEager.get_actions())
|
moves = ArcEager(vocab.strings, ArcEager.get_actions())
|
||||||
moves.add_action(2, 'left')
|
moves.add_action(2, "left")
|
||||||
moves.add_action(3, 'right')
|
moves.add_action(3, "right")
|
||||||
return moves
|
return moves
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def words():
|
def words():
|
||||||
return ['a', 'b']
|
return ["a", "b"]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(words, vocab):
|
def doc(words, vocab):
|
||||||
|
@@ -48,19 +51,21 @@ def doc(words, vocab):
|
||||||
vocab = Vocab()
|
vocab = Vocab()
|
||||||
return Doc(vocab, words=list(words))
|
return Doc(vocab, words=list(words))
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def gold(doc, words):
|
def gold(doc, words):
|
||||||
if len(words) == 2:
|
if len(words) == 2:
|
||||||
return GoldParse(doc, words=['a', 'b'], heads=[0, 0], deps=['ROOT', 'right'])
|
return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"])
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_oracle_four_words(arc_eager, vocab):
|
def test_oracle_four_words(arc_eager, vocab):
|
||||||
words = ['a', 'b', 'c', 'd']
|
words = ["a", "b", "c", "d"]
|
||||||
heads = [1, 1, 3, 3]
|
heads = [1, 1, 3, 3]
|
||||||
deps = ['left', 'ROOT', 'left', 'ROOT']
|
deps = ["left", "ROOT", "left", "ROOT"]
|
||||||
actions = ['L-left', 'B-ROOT', 'L-left']
|
actions = ["L-left", "B-ROOT", "L-left"]
|
||||||
state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
|
state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
|
||||||
assert state.is_final()
|
assert state.is_final()
|
||||||
for i, state_costs in enumerate(cost_history):
|
for i, state_costs in enumerate(cost_history):
|
||||||
|
@@ -72,63 +77,65 @@ def test_oracle_four_words(arc_eager, vocab):
|
||||||
|
|
||||||
|
|
||||||
annot_tuples = [
|
annot_tuples = [
|
||||||
(0, 'When', 'WRB', 11, 'advmod', 'O'),
|
(0, "When", "WRB", 11, "advmod", "O"),
|
||||||
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
|
(1, "Walter", "NNP", 2, "compound", "B-PERSON"),
|
||||||
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
|
(2, "Rodgers", "NNP", 11, "nsubj", "L-PERSON"),
|
||||||
(3, ',', ',', 2, 'punct', 'O'),
|
(3, ",", ",", 2, "punct", "O"),
|
||||||
(4, 'our', 'PRP$', 6, 'poss', 'O'),
|
(4, "our", "PRP$", 6, "poss", "O"),
|
||||||
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
|
(5, "embedded", "VBN", 6, "amod", "O"),
|
||||||
(6, 'reporter', 'NN', 2, 'appos', 'O'),
|
(6, "reporter", "NN", 2, "appos", "O"),
|
||||||
(7, 'with', 'IN', 6, 'prep', 'O'),
|
(7, "with", "IN", 6, "prep", "O"),
|
||||||
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
|
(8, "the", "DT", 10, "det", "B-ORG"),
|
||||||
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
|
(9, "3rd", "NNP", 10, "compound", "I-ORG"),
|
||||||
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
|
(10, "Cavalry", "NNP", 7, "pobj", "L-ORG"),
|
||||||
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
|
(11, "says", "VBZ", 44, "advcl", "O"),
|
||||||
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
|
(12, "three", "CD", 13, "nummod", "U-CARDINAL"),
|
||||||
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
|
(13, "battalions", "NNS", 16, "nsubj", "O"),
|
||||||
(14, 'of', 'IN', 13, 'prep', 'O'),
|
(14, "of", "IN", 13, "prep", "O"),
|
||||||
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
|
(15, "troops", "NNS", 14, "pobj", "O"),
|
||||||
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
|
(16, "are", "VBP", 11, "ccomp", "O"),
|
||||||
(17, 'on', 'IN', 16, 'prep', 'O'),
|
(17, "on", "IN", 16, "prep", "O"),
|
||||||
(18, 'the', 'DT', 19, 'det', 'O'),
|
(18, "the", "DT", 19, "det", "O"),
|
||||||
(19, 'ground', 'NN', 17, 'pobj', 'O'),
|
(19, "ground", "NN", 17, "pobj", "O"),
|
||||||
(20, ',', ',', 17, 'punct', 'O'),
|
(20, ",", ",", 17, "punct", "O"),
|
||||||
(21, 'inside', 'IN', 17, 'prep', 'O'),
|
(21, "inside", "IN", 17, "prep", "O"),
|
||||||
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
|
(22, "Baghdad", "NNP", 21, "pobj", "U-GPE"),
|
||||||
(23, 'itself', 'PRP', 22, 'appos', 'O'),
|
(23, "itself", "PRP", 22, "appos", "O"),
|
||||||
(24, ',', ',', 16, 'punct', 'O'),
|
(24, ",", ",", 16, "punct", "O"),
|
||||||
(25, 'have', 'VBP', 26, 'aux', 'O'),
|
(25, "have", "VBP", 26, "aux", "O"),
|
||||||
(26, 'taken', 'VBN', 16, 'dep', 'O'),
|
(26, "taken", "VBN", 16, "dep", "O"),
|
||||||
(27, 'up', 'RP', 26, 'prt', 'O'),
|
(27, "up", "RP", 26, "prt", "O"),
|
||||||
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
|
(28, "positions", "NNS", 26, "dobj", "O"),
|
||||||
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
|
(29, "they", "PRP", 31, "nsubj", "O"),
|
||||||
(30, "'re", 'VBP', 31, 'aux', 'O'),
|
(30, "'re", "VBP", 31, "aux", "O"),
|
||||||
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
|
(31, "going", "VBG", 26, "parataxis", "O"),
|
||||||
(32, 'to', 'TO', 33, 'aux', 'O'),
|
(32, "to", "TO", 33, "aux", "O"),
|
||||||
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
|
(33, "spend", "VB", 31, "xcomp", "O"),
|
||||||
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
|
(34, "the", "DT", 35, "det", "B-TIME"),
|
||||||
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
|
(35, "night", "NN", 33, "dobj", "L-TIME"),
|
||||||
(36, 'there', 'RB', 33, 'advmod', 'O'),
|
(36, "there", "RB", 33, "advmod", "O"),
|
||||||
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
|
(37, "presumably", "RB", 33, "advmod", "O"),
|
||||||
(38, ',', ',', 44, 'punct', 'O'),
|
(38, ",", ",", 44, "punct", "O"),
|
||||||
(39, 'how', 'WRB', 40, 'advmod', 'O'),
|
(39, "how", "WRB", 40, "advmod", "O"),
|
||||||
(40, 'many', 'JJ', 41, 'amod', 'O'),
|
(40, "many", "JJ", 41, "amod", "O"),
|
||||||
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
|
(41, "soldiers", "NNS", 44, "pobj", "O"),
|
||||||
(42, 'are', 'VBP', 44, 'aux', 'O'),
|
(42, "are", "VBP", 44, "aux", "O"),
|
||||||
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
|
(43, "we", "PRP", 44, "nsubj", "O"),
|
||||||
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
|
(44, "talking", "VBG", 44, "ROOT", "O"),
|
||||||
(45, 'about', 'IN', 44, 'prep', 'O'),
|
(45, "about", "IN", 44, "prep", "O"),
|
||||||
(46, 'right', 'RB', 47, 'advmod', 'O'),
|
(46, "right", "RB", 47, "advmod", "O"),
|
||||||
(47, 'now', 'RB', 44, 'advmod', 'O'),
|
(47, "now", "RB", 44, "advmod", "O"),
|
||||||
(48, '?', '.', 44, 'punct', 'O')]
|
(48, "?", ".", 44, "punct", "O"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_get_oracle_actions():
|
def test_get_oracle_actions():
|
||||||
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
|
||||||
parser = DependencyParser(doc.vocab)
|
parser = DependencyParser(doc.vocab)
|
||||||
parser.moves.add_action(0, '')
|
parser.moves.add_action(0, "")
|
||||||
parser.moves.add_action(1, '')
|
parser.moves.add_action(1, "")
|
||||||
parser.moves.add_action(1, '')
|
parser.moves.add_action(1, "")
|
||||||
parser.moves.add_action(4, 'ROOT')
|
parser.moves.add_action(4, "ROOT")
|
||||||
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
|
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
|
||||||
if head > i:
|
if head > i:
|
||||||
parser.moves.add_action(2, dep)
|
parser.moves.add_action(2, dep)
|
||||||
|
|
|
@@ -16,15 +16,17 @@ def vocab():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(vocab):
|
def doc(vocab):
|
||||||
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
|
return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def entity_annots(doc):
|
def entity_annots(doc):
|
||||||
casey = doc[0:1]
|
casey = doc[0:1]
|
||||||
ny = doc[3:5]
|
ny = doc[3:5]
|
||||||
return [(casey.start_char, casey.end_char, 'PERSON'),
|
return [
|
||||||
(ny.start_char, ny.end_char, 'GPE')]
|
(casey.start_char, casey.end_char, "PERSON"),
|
||||||
|
(ny.start_char, ny.end_char, "GPE"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
@@ -43,32 +45,33 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
|
||||||
tsys.preprocess_gold(gold)
|
tsys.preprocess_gold(gold)
|
||||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
names = [tsys.get_class_name(act) for act in act_classes]
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
|
assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
|
||||||
|
|
||||||
|
|
||||||
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
|
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
|
||||||
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
|
entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
|
||||||
gold = GoldParse(doc, entities=entity_annots)
|
gold = GoldParse(doc, entities=entity_annots)
|
||||||
for i, tag in enumerate(gold.ner):
|
for i, tag in enumerate(gold.ner):
|
||||||
if tag == 'L-!GPE':
|
if tag == "L-!GPE":
|
||||||
gold.ner[i] = '-'
|
gold.ner[i] = "-"
|
||||||
tsys.preprocess_gold(gold)
|
tsys.preprocess_gold(gold)
|
||||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
names = [tsys.get_class_name(act) for act in act_classes]
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
|
||||||
|
|
||||||
def test_get_oracle_moves_negative_entities2(tsys, vocab):
|
def test_get_oracle_moves_negative_entities2(tsys, vocab):
|
||||||
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
doc = Doc(vocab, words=["A", "B", "C", "D"])
|
||||||
gold = GoldParse(doc, entities=[])
|
gold = GoldParse(doc, entities=[])
|
||||||
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
|
gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
|
||||||
tsys.preprocess_gold(gold)
|
tsys.preprocess_gold(gold)
|
||||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
names = [tsys.get_class_name(act) for act in act_classes]
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
|
||||||
|
|
||||||
def test_get_oracle_moves_negative_O(tsys, vocab):
|
def test_get_oracle_moves_negative_O(tsys, vocab):
|
||||||
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
|
doc = Doc(vocab, words=["A", "B", "C", "D"])
|
||||||
gold = GoldParse(doc, entities=[])
|
gold = GoldParse(doc, entities=[])
|
||||||
gold.ner = ['O', '!O', 'O', '!O']
|
gold.ner = ["O", "!O", "O", "!O"]
|
||||||
tsys.preprocess_gold(gold)
|
tsys.preprocess_gold(gold)
|
||||||
act_classes = tsys.get_oracle_sequence(doc, gold)
|
act_classes = tsys.get_oracle_sequence(doc, gold)
|
||||||
names = [tsys.get_class_name(act) for act in act_classes]
|
names = [tsys.get_class_name(act) for act in act_classes]
|
||||||
|
@@ -80,8 +83,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
|
||||||
ner.begin_training([])
|
ner.begin_training([])
|
||||||
ner(doc)
|
ner(doc)
|
||||||
assert len(list(doc.ents)) == 0
|
assert len(list(doc.ents)) == 0
|
||||||
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))
|
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
|
||||||
doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
|
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
|
||||||
assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
|
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
|
||||||
doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
|
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
|
||||||
assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']
|
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
|
||||||
|
|
|
@@ -17,7 +17,7 @@ def vocab():
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def arc_eager(vocab):
|
def arc_eager(vocab):
|
||||||
actions = ArcEager.get_actions(left_labels=['L'], right_labels=['R'])
|
actions = ArcEager.get_actions(left_labels=["L"], right_labels=["R"])
|
||||||
return ArcEager(vocab.strings, actions)
|
return ArcEager(vocab.strings, actions)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -30,6 +30,7 @@ def tok2vec():
|
||||||
def parser(vocab, arc_eager):
|
def parser(vocab, arc_eager):
|
||||||
return Parser(vocab, moves=arc_eager, model=None)
|
return Parser(vocab, moves=arc_eager, model=None)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def model(arc_eager, tok2vec):
|
def model(arc_eager, tok2vec):
|
||||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
|
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
|
||||||
|
@@ -37,12 +38,12 @@ def model(arc_eager, tok2vec):
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(vocab):
|
def doc(vocab):
|
||||||
return Doc(vocab, words=['a', 'b', 'c'])
|
return Doc(vocab, words=["a", "b", "c"])
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def gold(doc):
|
def gold(doc):
|
||||||
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
|
return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"])
|
||||||
|
|
||||||
|
|
||||||
def test_can_init_nn_parser(parser):
|
def test_can_init_nn_parser(parser):
|
||||||
|
@@ -62,8 +63,10 @@ def test_predict_doc(parser, tok2vec, model, doc):
|
||||||
|
|
||||||
def test_update_doc(parser, model, doc, gold):
|
def test_update_doc(parser, model, doc, gold):
|
||||||
parser.model = model
|
parser.model = model
|
||||||
|
|
||||||
def optimize(weights, gradient, key=None):
|
def optimize(weights, gradient, key=None):
|
||||||
weights -= 0.001 * gradient
|
weights -= 0.001 * gradient
|
||||||
|
|
||||||
parser.update([doc], [gold], sgd=optimize)
|
parser.update([doc], [gold], sgd=optimize)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -76,6 +79,8 @@ def test_predict_doc_beam(parser, model, doc):
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_update_doc_beam(parser, model, doc, gold):
|
def test_update_doc_beam(parser, model, doc, gold):
|
||||||
parser.model = model
|
parser.model = model
|
||||||
|
|
||||||
def optimize(weights, gradient, key=None):
|
def optimize(weights, gradient, key=None):
|
||||||
weights -= 0.001 * gradient
|
weights -= 0.001 * gradient
|
||||||
|
|
||||||
parser.update_beam([doc], [gold], sgd=optimize)
|
parser.update_beam([doc], [gold], sgd=optimize)
|
||||||
|
|
|
@@ -21,20 +21,22 @@ def vocab():
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def moves(vocab):
|
def moves(vocab):
|
||||||
aeager = ArcEager(vocab.strings, {})
|
aeager = ArcEager(vocab.strings, {})
|
||||||
aeager.add_action(2, 'nsubj')
|
aeager.add_action(2, "nsubj")
|
||||||
aeager.add_action(3, 'dobj')
|
aeager.add_action(3, "dobj")
|
||||||
aeager.add_action(2, 'aux')
|
aeager.add_action(2, "aux")
|
||||||
return aeager
|
return aeager
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def docs(vocab):
|
def docs(vocab):
|
||||||
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
|
return [Doc(vocab, words=["Rats", "bite", "things"])]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def states(docs):
|
def states(docs):
|
||||||
return [StateClass(doc) for doc in docs]
|
return [StateClass(doc) for doc in docs]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def tokvecs(docs, vector_size):
|
def tokvecs(docs, vector_size):
|
||||||
output = []
|
output = []
|
||||||
|
@@ -73,9 +75,10 @@ def beam(moves, states, golds, beam_width):
|
||||||
def scores(moves, batch_size, beam_width):
|
def scores(moves, batch_size, beam_width):
|
||||||
return [
|
return [
|
||||||
numpy.asarray(
|
numpy.asarray(
|
||||||
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
|
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f"
|
||||||
dtype='f')
|
)
|
||||||
for _ in range(batch_size)]
|
for _ in range(batch_size)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_create_beam(beam):
|
def test_create_beam(beam):
|
||||||
|
@@ -93,8 +96,8 @@ def test_beam_advance_too_few_scores(beam, scores):
|
||||||
|
|
||||||
def test_beam_parse():
|
def test_beam_parse():
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
nlp.add_pipe(DependencyParser(nlp.vocab), name='parser')
|
nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
|
||||||
nlp.parser.add_label('nsubj')
|
nlp.parser.add_label("nsubj")
|
||||||
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
|
||||||
doc = nlp.make_doc('Australia is a country')
|
doc = nlp.make_doc("Australia is a country")
|
||||||
nlp.parser(doc, beam_width=2)
|
nlp.parser(doc, beam_width=2)
|
||||||
|
|
|
@@ -40,106 +40,116 @@ def multirooted_tree():
|
||||||
|
|
||||||
|
|
||||||
def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
assert([a for a in ancestors(3, tree)] == [4, 5, 2])
|
assert [a for a in ancestors(3, tree)] == [4, 5, 2]
|
||||||
assert([a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4])
|
assert [a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4]
|
||||||
assert([a for a in ancestors(3, partial_tree)] == [4, 5, None])
|
assert [a for a in ancestors(3, partial_tree)] == [4, 5, None]
|
||||||
assert([a for a in ancestors(17, multirooted_tree)] == [])
|
assert [a for a in ancestors(17, multirooted_tree)] == []
|
||||||
|
|
||||||
|
|
||||||
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
|
||||||
assert(contains_cycle(tree) == None)
|
assert contains_cycle(tree) == None
|
||||||
assert(contains_cycle(cyclic_tree) == set([3, 4, 5]))
|
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
|
||||||
assert(contains_cycle(partial_tree) == None)
|
assert contains_cycle(partial_tree) == None
|
||||||
assert(contains_cycle(multirooted_tree) == None)
|
assert contains_cycle(multirooted_tree) == None
|
||||||
|
|
||||||
|
|
||||||
def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
|
def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
|
||||||
assert(is_nonproj_arc(0, nonproj_tree) == False)
|
assert is_nonproj_arc(0, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(1, nonproj_tree) == False)
|
assert is_nonproj_arc(1, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(2, nonproj_tree) == False)
|
assert is_nonproj_arc(2, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(3, nonproj_tree) == False)
|
assert is_nonproj_arc(3, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(4, nonproj_tree) == False)
|
assert is_nonproj_arc(4, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(5, nonproj_tree) == False)
|
assert is_nonproj_arc(5, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(6, nonproj_tree) == False)
|
assert is_nonproj_arc(6, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(7, nonproj_tree) == True)
|
assert is_nonproj_arc(7, nonproj_tree) == True
|
||||||
assert(is_nonproj_arc(8, nonproj_tree) == False)
|
assert is_nonproj_arc(8, nonproj_tree) == False
|
||||||
assert(is_nonproj_arc(7, partial_tree) == False)
|
assert is_nonproj_arc(7, partial_tree) == False
|
||||||
assert(is_nonproj_arc(17, multirooted_tree) == False)
|
assert is_nonproj_arc(17, multirooted_tree) == False
|
||||||
assert(is_nonproj_arc(16, multirooted_tree) == True)
|
assert is_nonproj_arc(16, multirooted_tree) == True
|
||||||
|
|
||||||
|
|
||||||
def test_parser_is_nonproj_tree(proj_tree, nonproj_tree, partial_tree, multirooted_tree):
|
def test_parser_is_nonproj_tree(
|
||||||
assert(is_nonproj_tree(proj_tree) == False)
|
proj_tree, nonproj_tree, partial_tree, multirooted_tree
|
||||||
assert(is_nonproj_tree(nonproj_tree) == True)
|
):
|
||||||
assert(is_nonproj_tree(partial_tree) == False)
|
assert is_nonproj_tree(proj_tree) == False
|
||||||
assert(is_nonproj_tree(multirooted_tree) == True)
|
assert is_nonproj_tree(nonproj_tree) == True
|
||||||
|
assert is_nonproj_tree(partial_tree) == False
|
||||||
|
assert is_nonproj_tree(multirooted_tree) == True
|
||||||
|
|
||||||
|
|
||||||
def test_parser_pseudoprojectivity(en_tokenizer):
|
def test_parser_pseudoprojectivity(en_tokenizer):
|
||||||
def deprojectivize(proj_heads, deco_labels):
|
def deprojectivize(proj_heads, deco_labels):
|
||||||
tokens = en_tokenizer('whatever ' * len(proj_heads))
|
tokens = en_tokenizer("whatever " * len(proj_heads))
|
||||||
rel_proj_heads = [head-i for i, head in enumerate(proj_heads)]
|
rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens],
|
doc = get_doc(
|
||||||
deps=deco_labels, heads=rel_proj_heads)
|
tokens.vocab,
|
||||||
|
words=[t.text for t in tokens],
|
||||||
|
deps=deco_labels,
|
||||||
|
heads=rel_proj_heads,
|
||||||
|
)
|
||||||
nonproj.deprojectivize(doc)
|
nonproj.deprojectivize(doc)
|
||||||
return [t.head.i for t in doc], [token.dep_ for token in doc]
|
return [t.head.i for t in doc], [token.dep_ for token in doc]
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
tree = [1, 2, 2]
|
tree = [1, 2, 2]
|
||||||
nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
|
nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
|
||||||
nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
|
nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
|
||||||
labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl', 'punct']
|
labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
|
||||||
labels2 = ['advmod', 'root', 'det', 'nsubj', 'advmod', 'det', 'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod', 'det', 'amod', 'punct']
|
labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
assert(nonproj.decompose('X||Y') == ('X','Y'))
|
assert nonproj.decompose("X||Y") == ("X", "Y")
|
||||||
assert(nonproj.decompose('X') == ('X',''))
|
assert nonproj.decompose("X") == ("X", "")
|
||||||
assert(nonproj.is_decorated('X||Y') == True)
|
assert nonproj.is_decorated("X||Y") == True
|
||||||
assert(nonproj.is_decorated('X') == False)
|
assert nonproj.is_decorated("X") == False
|
||||||
|
|
||||||
nonproj._lift(0, tree)
|
nonproj._lift(0, tree)
|
||||||
assert(tree == [2, 2, 2])
|
assert tree == [2, 2, 2]
|
||||||
|
|
||||||
assert(nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7)
|
assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
|
||||||
assert(nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10)
|
assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
|
||||||
|
|
||||||
|
# fmt: off
|
||||||
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
|
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
|
||||||
assert(proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2])
|
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
||||||
assert(deco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux',
|
assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
|
||||||
'nsubj', 'acl||dobj', 'punct'])
|
"nsubj", "acl||dobj", "punct"]
|
||||||
|
|
||||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||||
assert(deproj_heads == nonproj_tree)
|
assert deproj_heads == nonproj_tree
|
||||||
assert(undeco_labels == labels)
|
assert undeco_labels == labels
|
||||||
|
|
||||||
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
|
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
|
||||||
assert(proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1])
|
assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
|
||||||
assert(deco_labels == ['advmod||aux', 'root', 'det', 'nsubj', 'advmod',
|
assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
|
||||||
'det', 'dobj', 'det', 'nmod', 'aux', 'nmod||dobj',
|
"det", "dobj", "det", "nmod", "aux", "nmod||dobj",
|
||||||
'advmod', 'det', 'amod', 'punct'])
|
"advmod", "det", "amod", "punct"]
|
||||||
|
|
||||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||||
assert(deproj_heads == nonproj_tree2)
|
assert deproj_heads == nonproj_tree2
|
||||||
assert(undeco_labels == labels2)
|
assert undeco_labels == labels2
|
||||||
|
|
||||||
# if decoration is wrong such that there is no head with the desired label
|
# if decoration is wrong such that there is no head with the desired label
|
||||||
# the structure is kept and the label is undecorated
|
# the structure is kept and the label is undecorated
|
||||||
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
|
||||||
deco_labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj',
|
deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
|
||||||
'acl||iobj', 'punct']
|
"acl||iobj", "punct"]
|
||||||
|
|
||||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||||
assert(deproj_heads == proj_heads)
|
assert deproj_heads == proj_heads
|
||||||
assert(undeco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux',
|
assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
|
||||||
'nsubj', 'acl', 'punct'])
|
"nsubj", "acl", "punct"]
|
||||||
|
|
||||||
# if there are two potential new heads, the first one is chosen even if
|
# if there are two potential new heads, the first one is chosen even if
|
||||||
# it's wrong
|
# it"s wrong
|
||||||
proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
|
proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
|
||||||
deco_labels = ['advmod||aux', 'root', 'det', 'aux', 'advmod', 'det',
|
deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
|
||||||
'dobj', 'det', 'nmod', 'aux', 'nmod||dobj', 'advmod',
|
"dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
|
||||||
'det', 'amod', 'punct']
|
"det", "amod", "punct"]
|
||||||
|
|
||||||
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
|
||||||
assert(deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1])
|
assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
|
||||||
assert(undeco_labels == ['advmod', 'root', 'det', 'aux', 'advmod', 'det',
|
assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det",
|
||||||
'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod',
|
"dobj", "det", "nmod", "aux", "nmod", "advmod",
|
||||||
'det', 'amod', 'punct'])
|
"det", "amod", "punct"]
|
||||||
|
# fmt: on
|
||||||
|
|
|
@@ -9,7 +9,7 @@ from ..util import get_doc, apply_transition_sequence
|
||||||
def test_parser_root(en_tokenizer):
|
def test_parser_root(en_tokenizer):
|
||||||
text = "i don't have other assistance"
|
text = "i don't have other assistance"
|
||||||
heads = [3, 2, 1, 0, 1, -2]
|
heads = [3, 2, 1, 0, 1, -2]
|
||||||
deps = ['nsubj', 'aux', 'neg', 'ROOT', 'amod', 'dobj']
|
deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||||
for t in doc:
|
for t in doc:
|
||||||
|
@@ -17,10 +17,12 @@ def test_parser_root(en_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize('text', ["Hello"])
|
@pytest.mark.parametrize("text", ["Hello"])
|
||||||
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
|
doc = get_doc(
|
||||||
|
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
|
||||||
|
)
|
||||||
|
|
||||||
assert len(doc) == 1
|
assert len(doc) == 1
|
||||||
with en_parser.step_through(doc) as _:
|
with en_parser.step_through(doc) as _:
|
||||||
|
@@ -32,7 +34,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
|
||||||
def test_parser_initial(en_tokenizer, en_parser):
|
def test_parser_initial(en_tokenizer, en_parser):
|
||||||
text = "I ate the pizza with anchovies."
|
text = "I ate the pizza with anchovies."
|
||||||
heads = [1, 0, 1, -2, -3, -1, -5]
|
heads = [1, 0, 1, -2, -3, -1, -5]
|
||||||
transition = ['L-nsubj', 'S', 'L-det']
|
transition = ["L-nsubj", "S", "L-det"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
apply_transition_sequence(en_parser, tokens, transition)
|
apply_transition_sequence(en_parser, tokens, transition)
|
||||||
assert tokens[0].head.i == 1
|
assert tokens[0].head.i == 1
|
||||||
|
@@ -58,17 +60,19 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser):
|
||||||
def test_parser_merge_pp(en_tokenizer):
|
def test_parser_merge_pp(en_tokenizer):
|
||||||
text = "A phrase with another phrase occurs"
|
text = "A phrase with another phrase occurs"
|
||||||
heads = [1, 4, -1, 1, -2, 0]
|
heads = [1, 4, -1, 1, -2, 0]
|
||||||
deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT']
|
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
|
||||||
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ']
|
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags)
|
doc = get_doc(
|
||||||
|
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
|
||||||
|
)
|
||||||
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
|
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
|
||||||
for start, end, lemma in nps:
|
for start, end, lemma in nps:
|
||||||
doc.merge(start, end, label='NP', lemma=lemma)
|
doc.merge(start, end, label="NP", lemma=lemma)
|
||||||
assert doc[0].text == 'A phrase'
|
assert doc[0].text == "A phrase"
|
||||||
assert doc[1].text == 'with'
|
assert doc[1].text == "with"
|
||||||
assert doc[2].text == 'another phrase'
|
assert doc[2].text == "another phrase"
|
||||||
assert doc[3].text == 'occurs'
|
assert doc[3].text == "occurs"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
|
@@ -76,7 +80,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
||||||
text = "a b c d e"
|
text = "a b c d e"
|
||||||
|
|
||||||
# right branching
|
# right branching
|
||||||
transition = ['R-nsubj', 'D', 'R-nsubj', 'R-nsubj', 'D', 'R-ROOT']
|
transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
apply_transition_sequence(en_parser, tokens, transition)
|
apply_transition_sequence(en_parser, tokens, transition)
|
||||||
|
|
||||||
|
@@ -111,7 +115,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
|
||||||
assert tokens[4].head.i == 2
|
assert tokens[4].head.i == 2
|
||||||
|
|
||||||
# left branching
|
# left branching
|
||||||
transition = ['S', 'S', 'S', 'L-nsubj','L-nsubj','L-nsubj', 'L-nsubj']
|
transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
apply_transition_sequence(en_parser, tokens, transition)
|
apply_transition_sequence(en_parser, tokens, transition)
|
||||||
|
|
||||||
|
|
|
@@ -33,6 +33,7 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def heads():
|
def heads():
|
||||||
|
# fmt: off
|
||||||
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
|
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
|
||||||
-1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
|
-1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
|
||||||
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
|
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
|
||||||
|
@@ -50,6 +51,7 @@ def heads():
|
||||||
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
|
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
|
||||||
1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
|
1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
|
||||||
-1, -8, -9, -1]
|
-1, -8, -9, -1]
|
||||||
|
# fmt: on
|
||||||
|
|
||||||
|
|
||||||
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
|
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
|
||||||
|
@@ -100,7 +102,14 @@ def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
|
||||||
for token in doc:
|
for token in doc:
|
||||||
subtree = list(token.subtree)
|
subtree = list(token.subtree)
|
||||||
debug = '\t'.join((token.text, token.left_edge.text, subtree[0].text))
|
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
|
||||||
assert token.left_edge == subtree[0], debug
|
assert token.left_edge == subtree[0], debug
|
||||||
debug = '\t'.join((token.text, token.right_edge.text, subtree[-1].text, token.right_edge.head.text))
|
debug = "\t".join(
|
||||||
|
(
|
||||||
|
token.text,
|
||||||
|
token.right_edge.text,
|
||||||
|
subtree[-1].text,
|
||||||
|
token.right_edge.head.text,
|
||||||
|
)
|
||||||
|
)
|
||||||
assert token.right_edge == subtree[-1], debug
|
assert token.right_edge == subtree[-1], debug
|
||||||
|
|
|
@@ -19,34 +19,33 @@ def vocab():
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def parser(vocab):
|
def parser(vocab):
|
||||||
parser = DependencyParser(vocab)
|
parser = DependencyParser(vocab)
|
||||||
parser.cfg['token_vector_width'] = 4
|
parser.cfg["token_vector_width"] = 4
|
||||||
parser.cfg['hidden_width'] = 32
|
parser.cfg["hidden_width"] = 32
|
||||||
#parser.add_label('right')
|
# parser.add_label('right')
|
||||||
parser.add_label('left')
|
parser.add_label("left")
|
||||||
parser.begin_training([], **parser.cfg)
|
parser.begin_training([], **parser.cfg)
|
||||||
sgd = Adam(NumpyOps(), 0.001)
|
sgd = Adam(NumpyOps(), 0.001)
|
||||||
|
|
||||||
for i in range(10):
|
for i in range(10):
|
||||||
losses = {}
|
losses = {}
|
||||||
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(vocab, words=["a", "b", "c", "d"])
|
||||||
gold = GoldParse(doc, heads=[1, 1, 3, 3],
|
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
|
||||||
deps=['left', 'ROOT', 'left', 'ROOT'])
|
|
||||||
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
parser.update([doc], [gold], sgd=sgd, losses=losses)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
def test_no_sentences(parser):
|
def test_no_sentences(parser):
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert len(list(doc.sents)) >= 1
|
assert len(list(doc.sents)) >= 1
|
||||||
|
|
||||||
|
|
||||||
def test_sents_1(parser):
|
def test_sents_1(parser):
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc[2].sent_start = True
|
doc[2].sent_start = True
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert len(list(doc.sents)) >= 2
|
assert len(list(doc.sents)) >= 2
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc[1].sent_start = False
|
doc[1].sent_start = False
|
||||||
doc[2].sent_start = True
|
doc[2].sent_start = True
|
||||||
doc[3].sent_start = False
|
doc[3].sent_start = False
|
||||||
|
@@ -55,7 +54,7 @@ def test_sents_1(parser):
|
||||||
|
|
||||||
|
|
||||||
def test_sents_1_2(parser):
|
def test_sents_1_2(parser):
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc[1].sent_start = True
|
doc[1].sent_start = True
|
||||||
doc[2].sent_start = True
|
doc[2].sent_start = True
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
|
@@ -63,12 +62,12 @@ def test_sents_1_2(parser):
|
||||||
|
|
||||||
|
|
||||||
def test_sents_1_3(parser):
|
def test_sents_1_3(parser):
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc[1].sent_start = True
|
doc[1].sent_start = True
|
||||||
doc[3].sent_start = True
|
doc[3].sent_start = True
|
||||||
doc = parser(doc)
|
doc = parser(doc)
|
||||||
assert len(list(doc.sents)) >= 3
|
assert len(list(doc.sents)) >= 3
|
||||||
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
|
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
|
||||||
doc[1].sent_start = True
|
doc[1].sent_start = True
|
||||||
doc[2].sent_start = False
|
doc[2].sent_start = False
|
||||||
doc[3].sent_start = True
|
doc[3].sent_start = True
|
||||||
|
|
|
@@ -19,11 +19,13 @@ def test_parser_space_attachment(en_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
def test_parser_sentence_space(en_tokenizer):
|
def test_parser_sentence_space(en_tokenizer):
|
||||||
|
# fmt: off
|
||||||
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
|
||||||
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
|
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
|
||||||
deps = ['nsubj', 'ROOT', 'advmod', 'prep', 'pcomp', 'dobj', 'punct', '',
|
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
|
||||||
'nsubjpass', 'aux', 'auxpass', 'ROOT', 'nsubj', 'aux', 'ccomp',
|
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
|
||||||
'poss', 'nsubj', 'ccomp', 'punct']
|
"poss", "nsubj", "ccomp", "punct"]
|
||||||
|
# fmt: on
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
|
||||||
assert len(list(doc.sents)) == 2
|
assert len(list(doc.sents)) == 2
|
||||||
|
@@ -34,10 +36,10 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
|
||||||
text = "\t \n This is a sentence ."
|
text = "\t \n This is a sentence ."
|
||||||
heads = [1, 1, 0, 1, -2, -3]
|
heads = [1, 1, 0, 1, -2, -3]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads)
|
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
|
||||||
assert doc[0].is_space
|
assert doc[0].is_space
|
||||||
assert doc[1].is_space
|
assert doc[1].is_space
|
||||||
assert doc[2].text == 'This'
|
assert doc[2].text == "This"
|
||||||
with en_parser.step_through(doc) as stepwise:
|
with en_parser.step_through(doc) as stepwise:
|
||||||
pass
|
pass
|
||||||
assert doc[0].head.i == 2
|
assert doc[0].head.i == 2
|
||||||
|
@@ -49,9 +51,9 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
|
||||||
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
||||||
text = "This is \t a \t\n \n sentence . \n\n \n"
|
text = "This is \t a \t\n \n sentence . \n\n \n"
|
||||||
heads = [1, 0, -1, 2, -1, -4, -5, -1]
|
heads = [1, 0, -1, 2, -1, -4, -5, -1]
|
||||||
transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct']
|
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads)
|
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
|
||||||
assert doc[2].is_space
|
assert doc[2].is_space
|
||||||
assert doc[4].is_space
|
assert doc[4].is_space
|
||||||
assert doc[5].is_space
|
assert doc[5].is_space
|
||||||
|
@@ -64,8 +66,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
|
||||||
assert [token.head.i for token in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
|
assert [token.head.i for token in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('text,length', [(['\n'], 1),
|
@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
|
||||||
(['\n', '\t', '\n\n', '\t'], 4)])
|
|
||||||
@pytest.mark.xfail
|
@pytest.mark.xfail
|
||||||
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
|
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
|
||||||
doc = Doc(en_parser.vocab, words=text)
|
doc = Doc(en_parser.vocab, words=text)
|
||||||
|
@@ -74,4 +75,4 @@ def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
|
||||||
pass
|
pass
|
||||||
assert doc[0].is_space
|
assert doc[0].is_space
|
||||||
for token in doc:
|
for token in doc:
|
||||||
assert token.head.i == length-1
|
assert token.head.i == length - 1
|
||||||
|
|
|
@@ -18,14 +18,16 @@ def patterns():
|
||||||
{"label": "HELLO", "pattern": "hello world"},
|
{"label": "HELLO", "pattern": "hello world"},
|
||||||
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
|
||||||
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
|
||||||
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}
|
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def add_ent():
|
def add_ent():
|
||||||
def add_ent_component(doc):
|
def add_ent_component(doc):
|
||||||
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings['ORG'])]
|
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
return add_ent_component
|
return add_ent_component
|
||||||
|
|
||||||
|
|
||||||
|
@@ -33,13 +35,13 @@ def test_entity_ruler_init(nlp, patterns):
|
||||||
ruler = EntityRuler(nlp, patterns=patterns)
|
ruler = EntityRuler(nlp, patterns=patterns)
|
||||||
assert len(ruler) == len(patterns)
|
assert len(ruler) == len(patterns)
|
||||||
assert len(ruler.labels) == 3
|
assert len(ruler.labels) == 3
|
||||||
assert 'HELLO' in ruler
|
assert "HELLO" in ruler
|
||||||
assert 'BYE' in ruler
|
assert "BYE" in ruler
|
||||||
nlp.add_pipe(ruler)
|
nlp.add_pipe(ruler)
|
||||||
doc = nlp("hello world bye bye")
|
doc = nlp("hello world bye bye")
|
||||||
assert len(doc.ents) == 2
|
assert len(doc.ents) == 2
|
||||||
assert doc.ents[0].label_ == 'HELLO'
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
assert doc.ents[1].label_ == 'BYE'
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing(nlp, patterns, add_ent):
|
def test_entity_ruler_existing(nlp, patterns, add_ent):
|
||||||
|
@@ -48,8 +50,8 @@ def test_entity_ruler_existing(nlp, patterns, add_ent):
|
||||||
nlp.add_pipe(ruler)
|
nlp.add_pipe(ruler)
|
||||||
doc = nlp("OH HELLO WORLD bye bye")
|
doc = nlp("OH HELLO WORLD bye bye")
|
||||||
assert len(doc.ents) == 2
|
assert len(doc.ents) == 2
|
||||||
assert doc.ents[0].label_ == 'ORG'
|
assert doc.ents[0].label_ == "ORG"
|
||||||
assert doc.ents[1].label_ == 'BYE'
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):
|
def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):
|
||||||
|
@@ -58,9 +60,9 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):
|
||||||
nlp.add_pipe(ruler)
|
nlp.add_pipe(ruler)
|
||||||
doc = nlp("OH HELLO WORLD bye bye")
|
doc = nlp("OH HELLO WORLD bye bye")
|
||||||
assert len(doc.ents) == 2
|
assert len(doc.ents) == 2
|
||||||
assert doc.ents[0].label_ == 'HELLO'
|
assert doc.ents[0].label_ == "HELLO"
|
||||||
assert doc.ents[0].text == 'HELLO'
|
assert doc.ents[0].text == "HELLO"
|
||||||
assert doc.ents[1].label_ == 'BYE'
|
assert doc.ents[1].label_ == "BYE"
|
||||||
|
|
||||||
|
|
||||||
def test_entity_ruler_existing_complex(nlp, patterns, add_ent):
|
def test_entity_ruler_existing_complex(nlp, patterns, add_ent):
|
||||||
|
@@ -69,8 +71,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, add_ent):
|
||||||
nlp.add_pipe(ruler)
|
nlp.add_pipe(ruler)
|
||||||
doc = nlp("foo foo bye bye")
|
doc = nlp("foo foo bye bye")
|
||||||
assert len(doc.ents) == 2
|
assert len(doc.ents) == 2
|
||||||
assert doc.ents[0].label_ == 'COMPLEX'
|
assert doc.ents[0].label_ == "COMPLEX"
|
||||||
assert doc.ents[1].label_ == 'BYE'
|
assert doc.ents[1].label_ == "BYE"
|
||||||
assert len(doc.ents[0]) == 2
|
assert len(doc.ents[0]) == 2
|
||||||
assert len(doc.ents[1]) == 2
|
assert len(doc.ents[1]) == 2
|
||||||
|
|
||||||
|
|
|
@@ -10,15 +10,21 @@ from ..util import get_doc
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def doc(en_tokenizer):
|
def doc(en_tokenizer):
|
||||||
text = 'I like New York in Autumn.'
|
text = "I like New York in Autumn."
|
||||||
heads = [1, 0, 1, -2, -3, -1, -5]
|
heads = [1, 0, 1, -2, -3, -1, -5]
|
||||||
tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
|
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
|
||||||
pos = ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT']
|
pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
|
||||||
deps = ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
|
deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
|
||||||
tokens = en_tokenizer(text)
|
tokens = en_tokenizer(text)
|
||||||
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads,
|
doc = get_doc(
|
||||||
tags=tags, pos=pos, deps=deps)
|
tokens.vocab,
|
||||||
doc.ents = [Span(doc, 2, 4, doc.vocab.strings['GPE'])]
|
words=[t.text for t in tokens],
|
||||||
|
heads=heads,
|
||||||
|
tags=tags,
|
||||||
|
pos=pos,
|
||||||
|
deps=deps,
|
||||||
|
)
|
||||||
|
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
|
||||||
doc.is_parsed = True
|
doc.is_parsed = True
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
return doc
|
return doc
|
||||||
|
@@ -27,18 +33,18 @@ def doc(en_tokenizer):
|
||||||
def test_factories_merge_noun_chunks(doc):
|
def test_factories_merge_noun_chunks(doc):
|
||||||
assert len(doc) == 7
|
assert len(doc) == 7
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
merge_noun_chunks = nlp.create_pipe('merge_noun_chunks')
|
merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
|
||||||
merge_noun_chunks(doc)
|
merge_noun_chunks(doc)
|
||||||
assert len(doc) == 6
|
assert len(doc) == 6
|
||||||
assert doc[2].text == 'New York'
|
assert doc[2].text == "New York"
|
||||||
|
|
||||||
|
|
||||||
def test_factories_merge_ents(doc):
|
def test_factories_merge_ents(doc):
|
||||||
assert len(doc) == 7
|
assert len(doc) == 7
|
||||||
assert len(list(doc.ents)) == 1
|
assert len(list(doc.ents)) == 1
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
merge_entities = nlp.create_pipe('merge_entities')
|
merge_entities = nlp.create_pipe("merge_entities")
|
||||||
merge_entities(doc)
|
merge_entities(doc)
|
||||||
assert len(doc) == 6
|
assert len(doc) == 6
|
||||||
assert len(list(doc.ents)) == 1
|
assert len(list(doc.ents)) == 1
|
||||||
assert doc[2].text == 'New York'
|
assert doc[2].text == "New York"
|
||||||
|
|
|
@@ -16,22 +16,22 @@ def new_pipe(doc):
|
||||||
|
|
||||||
def test_add_pipe_no_name(nlp):
|
def test_add_pipe_no_name(nlp):
|
||||||
nlp.add_pipe(new_pipe)
|
nlp.add_pipe(new_pipe)
|
||||||
assert 'new_pipe' in nlp.pipe_names
|
assert "new_pipe" in nlp.pipe_names
|
||||||
|
|
||||||
|
|
||||||
def test_add_pipe_duplicate_name(nlp):
|
def test_add_pipe_duplicate_name(nlp):
|
||||||
nlp.add_pipe(new_pipe, name='duplicate_name')
|
nlp.add_pipe(new_pipe, name="duplicate_name")
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.add_pipe(new_pipe, name='duplicate_name')
|
nlp.add_pipe(new_pipe, name="duplicate_name")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name', ['parser'])
|
@pytest.mark.parametrize("name", ["parser"])
|
||||||
def test_add_pipe_first(nlp, name):
|
def test_add_pipe_first(nlp, name):
|
||||||
nlp.add_pipe(new_pipe, name=name, first=True)
|
nlp.add_pipe(new_pipe, name=name, first=True)
|
||||||
assert nlp.pipeline[0][0] == name
|
assert nlp.pipeline[0][0] == name
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
|
@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")])
|
||||||
def test_add_pipe_last(nlp, name1, name2):
|
def test_add_pipe_last(nlp, name1, name2):
|
||||||
nlp.add_pipe(lambda doc: doc, name=name2)
|
nlp.add_pipe(lambda doc: doc, name=name2)
|
||||||
nlp.add_pipe(new_pipe, name=name1, last=True)
|
nlp.add_pipe(new_pipe, name=name1, last=True)
|
||||||
|
@@ -44,7 +44,7 @@ def test_cant_add_pipe_first_and_last(nlp):
|
||||||
nlp.add_pipe(new_pipe, first=True, last=True)
|
nlp.add_pipe(new_pipe, first=True, last=True)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name', ['my_component'])
|
@pytest.mark.parametrize("name", ["my_component"])
|
||||||
def test_get_pipe(nlp, name):
|
def test_get_pipe(nlp, name):
|
||||||
with pytest.raises(KeyError):
|
with pytest.raises(KeyError):
|
||||||
nlp.get_pipe(name)
|
nlp.get_pipe(name)
|
||||||
|
@@ -52,7 +52,7 @@ def test_get_pipe(nlp, name):
|
||||||
assert nlp.get_pipe(name) == new_pipe
|
assert nlp.get_pipe(name) == new_pipe
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
|
@pytest.mark.parametrize("name,replacement", [("my_component", lambda doc: doc)])
|
||||||
def test_replace_pipe(nlp, name, replacement):
|
def test_replace_pipe(nlp, name, replacement):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.replace_pipe(name, new_pipe)
|
nlp.replace_pipe(name, new_pipe)
|
||||||
|
@@ -62,7 +62,7 @@ def test_replace_pipe(nlp, name, replacement):
|
||||||
assert nlp.get_pipe(name) == replacement
|
assert nlp.get_pipe(name) == replacement
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
|
@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
|
||||||
def test_rename_pipe(nlp, old_name, new_name):
|
def test_rename_pipe(nlp, old_name, new_name):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.rename_pipe(old_name, new_name)
|
nlp.rename_pipe(old_name, new_name)
|
||||||
|
@@ -71,7 +71,7 @@ def test_rename_pipe(nlp, old_name, new_name):
|
||||||
assert nlp.pipeline[0][0] == new_name
|
assert nlp.pipeline[0][0] == new_name
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name', ['my_component'])
|
@pytest.mark.parametrize("name", ["my_component"])
|
||||||
def test_remove_pipe(nlp, name):
|
def test_remove_pipe(nlp, name):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.remove_pipe(name)
|
nlp.remove_pipe(name)
|
||||||
|
@@ -83,7 +83,7 @@ def test_remove_pipe(nlp, name):
|
||||||
assert removed_component == new_pipe
|
assert removed_component == new_pipe
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name', ['my_component'])
|
@pytest.mark.parametrize("name", ["my_component"])
|
||||||
def test_disable_pipes_method(nlp, name):
|
def test_disable_pipes_method(nlp, name):
|
||||||
nlp.add_pipe(new_pipe, name=name)
|
nlp.add_pipe(new_pipe, name=name)
|
||||||
assert nlp.has_pipe(name)
|
assert nlp.has_pipe(name)
|
||||||
|
@ -92,7 +92,7 @@ def test_disable_pipes_method(nlp, name):
|
||||||
disabled.restore()
|
disabled.restore()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('name', ['my_component'])
|
@pytest.mark.parametrize("name", ["my_component"])
|
||||||
def test_disable_pipes_context(nlp, name):
|
def test_disable_pipes_context(nlp, name):
|
||||||
nlp.add_pipe(new_pipe, name=name)
|
nlp.add_pipe(new_pipe, name=name)
|
||||||
assert nlp.has_pipe(name)
|
assert nlp.has_pipe(name)
|
||||||
|
@ -101,14 +101,14 @@ def test_disable_pipes_context(nlp, name):
|
||||||
assert nlp.has_pipe(name)
|
assert nlp.has_pipe(name)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('n_pipes', [100])
|
@pytest.mark.parametrize("n_pipes", [100])
|
||||||
def test_add_lots_of_pipes(nlp, n_pipes):
|
def test_add_lots_of_pipes(nlp, n_pipes):
|
||||||
for i in range(n_pipes):
|
for i in range(n_pipes):
|
||||||
nlp.add_pipe(lambda doc: doc, name='pipe_%d' % i)
|
nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i)
|
||||||
assert len(nlp.pipe_names) == n_pipes
|
assert len(nlp.pipe_names) == n_pipes
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('component', ['ner', {'hello': 'world'}])
|
@pytest.mark.parametrize("component", ["ner", {"hello": "world"}])
|
||||||
def test_raise_for_invalid_components(nlp, component):
|
def test_raise_for_invalid_components(nlp, component):
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
nlp.add_pipe(component)
|
nlp.add_pipe(component)
|
||||||
|
|
|
@@ -13,16 +13,21 @@ from spacy.gold import GoldParse
@pytest.mark.skip(reason="Test is flakey when run with others")
def test_simple_train():
nlp = Language()
-nlp.add_pipe(nlp.create_pipe('textcat'))
+nlp.add_pipe(nlp.create_pipe("textcat"))
-nlp.get_pipe('textcat').add_label('answer')
+nlp.get_pipe("textcat").add_label("answer")
nlp.begin_training()
for i in range(5):
-for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.),
-('bbbbbbbbb', 0.), ('aaaaaa', 1)]:
-nlp.update([text], [{'cats': {'answer': answer}}])
-doc = nlp('aaa')
-assert 'answer' in doc.cats
-assert doc.cats['answer'] >= 0.5
+for text, answer in [
+("aaaa", 1.0),
+("bbbb", 0),
+("aa", 1.0),
+("bbbbbbbbb", 0.0),
+("aaaaaa", 1),
+]:
+nlp.update([text], [{"cats": {"answer": answer}}])
+doc = nlp("aaa")
+assert "answer" in doc.cats
+assert doc.cats["answer"] >= 0.5


@pytest.mark.skip(reason="Test is flakey when run with others")
@@ -31,11 +36,11 @@ def test_textcat_learns_multilabel():
numpy.random.seed(5)
docs = []
nlp = Language()
-letters = ['a', 'b', 'c']
+letters = ["a", "b", "c"]
for w1 in letters:
for w2 in letters:
-cats = {letter: float(w2==letter) for letter in letters}
+cats = {letter: float(w2 == letter) for letter in letters}
-docs.append((Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
+docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
random.shuffle(docs)
model = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
@@ -49,8 +54,8 @@ def test_textcat_learns_multilabel():
random.shuffle(docs)
for w1 in letters:
for w2 in letters:
-doc = Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3)
+doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
-truth = {letter: w2==letter for letter in letters}
+truth = {letter: w2 == letter for letter in letters}
model(doc)
for cat, score in doc.cats.items():
if not truth[cat]:

@@ -14,14 +14,20 @@ from spacy.tokens import Doc
from ..util import get_doc, make_tempdir


-@pytest.mark.parametrize('patterns', [
-[[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
-[[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]])
+@pytest.mark.parametrize(
+"patterns",
+[
+[[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
+[[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]],
+],
+)
def test_issue118(en_tokenizer, patterns):
"""Test a bug that arose from having overlapping matches"""
-text = "how many points did lebron james score against the boston celtics last night"
+text = (
+"how many points did lebron james score against the boston celtics last night"
+)
doc = en_tokenizer(text)
-ORG = doc.vocab.strings['ORG']
+ORG = doc.vocab.strings["ORG"]
matcher = Matcher(doc.vocab)
matcher.add("BostonCeltics", None, *patterns)
assert len(list(doc.ents)) == 0
@@ -35,16 +41,22 @@ def test_issue118(en_tokenizer, patterns):
assert ents[0].end == 11


-@pytest.mark.parametrize('patterns', [
-[[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
-[[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]])
+@pytest.mark.parametrize(
+"patterns",
+[
+[[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
+[[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]],
+],
+)
def test_issue118_prefix_reorder(en_tokenizer, patterns):
"""Test a bug that arose from having overlapping matches"""
-text = "how many points did lebron james score against the boston celtics last night"
+text = (
+"how many points did lebron james score against the boston celtics last night"
+)
doc = en_tokenizer(text)
-ORG = doc.vocab.strings['ORG']
+ORG = doc.vocab.strings["ORG"]
matcher = Matcher(doc.vocab)
-matcher.add('BostonCeltics', None, *patterns)
+matcher.add("BostonCeltics", None, *patterns)
assert len(list(doc.ents)) == 0
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
doc.ents += tuple(matches)[1:]
@@ -59,11 +71,13 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
def test_issue242(en_tokenizer):
"""Test overlapping multi-word phrases."""
text = "There are different food safety standards in different countries."
-patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
-[{'LOWER': 'safety'}, {'LOWER': 'standards'}]]
+patterns = [
+[{"LOWER": "food"}, {"LOWER": "safety"}],
+[{"LOWER": "safety"}, {"LOWER": "standards"}],
+]
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
-matcher.add('FOOD', None, *patterns)
+matcher.add("FOOD", None, *patterns)

matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
doc.ents += tuple(matches)
@@ -77,7 +91,9 @@ def test_issue242(en_tokenizer):
def test_issue309(en_tokenizer):
"""Test Issue #309: SBD fails on empty string"""
tokens = en_tokenizer(" ")
-doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
+doc = get_doc(
+tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
+)
doc.is_parsed = True
assert len(doc) == 1
sents = list(doc.sents)
@@ -93,11 +109,11 @@ def test_issue351(en_tokenizer):

def test_issue360(en_tokenizer):
"""Test tokenization of big ellipsis"""
-tokens = en_tokenizer('$45...............Asking')
+tokens = en_tokenizer("$45...............Asking")
assert len(tokens) > 2


-@pytest.mark.parametrize('text1,text2', [("cat", "dog")])
+@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
"""Test Issue #361: Equality of lexemes"""
assert en_vocab[text1] == en_vocab[text1]
@@ -106,15 +122,19 @@ def test_issue361(en_vocab, text1, text2):

def test_issue587(en_tokenizer):
"""Test that Matcher doesn't segfault on particular input"""
-doc = en_tokenizer('a b; c')
+doc = en_tokenizer("a b; c")
matcher = Matcher(doc.vocab)
-matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
+matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}])
matches = matcher(doc)
assert len(matches) == 1
-matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
+matcher.add(
+"TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]
+)
matches = matcher(doc)
assert len(matches) == 2
-matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
+matcher.add(
+"TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]
+)
matches = matcher(doc)
assert len(matches) == 2

@@ -122,22 +142,26 @@ def test_issue587(en_tokenizer):
def test_issue588(en_vocab):
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
-matcher.add('TEST', None, [])
+matcher.add("TEST", None, [])


@pytest.mark.xfail
def test_issue589():
vocab = Vocab()
vocab.strings.set_frozen(True)
-doc = Doc(vocab, words=['whata'])
+doc = Doc(vocab, words=["whata"])


def test_issue590(en_vocab):
"""Test overlapping matches"""
-doc = Doc(en_vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
+doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
matcher = Matcher(en_vocab)
-matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
-matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
+matcher.add(
+"ab",
+None,
+[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
+)
+matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
matches = matcher(doc)
assert len(matches) == 2

@@ -145,14 +169,14 @@ def test_issue590(en_vocab):
def test_issue595():
"""Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"]
-tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
+tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
rules = {"verb": [["ed", "e"]]}
-lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
+lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=words)
-doc[2].tag_ = 'VB'
+doc[2].tag_ = "VB"
-assert doc[2].text == 'feed'
+assert doc[2].text == "feed"
-assert doc[2].lemma_ == 'feed'
+assert doc[2].lemma_ == "feed"


def test_issue599(en_vocab):
@@ -165,9 +189,9 @@ def test_issue599(en_vocab):


def test_issue600():
-vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
+vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
doc = Doc(vocab, words=["hello"])
-doc[0].tag_ = 'NN'
+doc[0].tag_ = "NN"


def test_issue615(en_tokenizer):
@@ -175,16 +199,17 @@ def test_issue615(en_tokenizer):
"""Merge a phrase. We have to be careful here because we'll change the
token indices. To avoid problems, merge all the phrases once we're called
on the last match."""
-if i != len(matches)-1:
+if i != len(matches) - 1:
return None
-spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
+spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
-span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
-label=label)
+span.merge(
+tag="NNP" if label else span.root.tag_, lemma=span.text, label=label
+)
doc.ents = doc.ents + ((label, span.start, span.end),)

text = "The golf club is broken"
-pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
+pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
label = "Sport_Equipment"
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
@@ -195,7 +220,7 @@ def test_issue615(en_tokenizer):
assert entities[0].label != 0


-@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")])
+@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
"""Test that times like "7am" are tokenized correctly and that numbers are
converted to string."""
@@ -204,7 +229,7 @@ def test_issue736(en_tokenizer, text, number):
assert tokens[0].text == number


-@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"])
+@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
"""Test that dates are not split and kept as one token. This behaviour is
currently inconsistent, since dates separated by hyphens are still split.
@@ -214,14 +239,14 @@ def test_issue740(en_tokenizer, text):


def test_issue743():
-doc = Doc(Vocab(), ['hello', 'world'])
+doc = Doc(Vocab(), ["hello", "world"])
token = doc[0]
s = set([token])
items = list(s)
assert items[0] is token


-@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
+@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
"""Test that 'were' and 'Were' are excluded from the contractions
generated by the English tokenizer exceptions."""
@@ -230,14 +255,15 @@ def test_issue744(en_tokenizer, text):
assert tokens[1].text.lower() == "were"


-@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
-("teneleven", False)])
+@pytest.mark.parametrize(
+"text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
+)
def test_issue759(en_tokenizer, text, is_num):
tokens = en_tokenizer(text)
assert tokens[0].like_num == is_num


-@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"])
+@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
"""Test that 'Shell' and 'shell' are excluded from the contractions
generated by the English tokenizer exceptions."""
@@ -246,28 +272,32 @@ def test_issue775(en_tokenizer, text):
assert tokens[0].text == text


-@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
+@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
doc = en_tokenizer(text)
-assert ''.join([token.text_with_ws for token in doc]) == text
+assert "".join([token.text_with_ws for token in doc]) == text


-@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
"""Test base case for Issue #792: Non-trailing whitespace"""
doc = en_tokenizer(text)
-assert ''.join([token.text_with_ws for token in doc]) == text
+assert "".join([token.text_with_ws for token in doc]) == text


-@pytest.mark.parametrize('text,tokens', [
-('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
-("exception;--exclusive", ["exception", ";--", "exclusive"]),
-("day.--Is", ["day", ".--", "Is"]),
-("refinement:--just", ["refinement", ":--", "just"]),
-("memories?--To", ["memories", "?--", "To"]),
-("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
-("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])])
+@pytest.mark.parametrize(
+"text,tokens",
+[
+('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
+("exception;--exclusive", ["exception", ";--", "exclusive"]),
+("day.--Is", ["day", ".--", "Is"]),
+("refinement:--just", ["refinement", ":--", "just"]),
+("memories?--To", ["memories", "?--", "To"]),
+("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
+("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
+],
+)
def test_issue801(en_tokenizer, text, tokens):
"""Test that special characters + hyphens are split correctly."""
doc = en_tokenizer(text)
@@ -275,10 +305,19 @@ def test_issue801(en_tokenizer, text, tokens):
assert [t.text for t in doc] == tokens


-@pytest.mark.parametrize('text,expected_tokens', [
-('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
-('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
-])
+@pytest.mark.parametrize(
+"text,expected_tokens",
+[
+(
+"Smörsåsen används bl.a. till fisk",
+["Smörsåsen", "används", "bl.a.", "till", "fisk"],
+),
+(
+"Jag kommer först kl. 13 p.g.a. diverse förseningar",
+["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
+),
+],
+)
def test_issue805(sv_tokenizer, text, expected_tokens):
tokens = sv_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
@@ -291,9 +330,9 @@ def test_issue850():
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
matcher = Matcher(vocab)
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-pattern = [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}]
+pattern = [{"LOWER": "bob"}, {"OP": "*", "IS_ANY_TOKEN": True}, {"LOWER": "frank"}]
-matcher.add('FarAway', None, pattern)
+matcher.add("FarAway", None, pattern)
-doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
+doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
match = matcher(doc)
assert len(match) == 1
ent_id, start, end = match[0]
@@ -306,9 +345,9 @@ def test_issue850_basic():
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
matcher = Matcher(vocab)
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-pattern = [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}]
+pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
-matcher.add('FarAway', None, pattern)
+matcher.add("FarAway", None, pattern)
-doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
+doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
match = matcher(doc)
assert len(match) == 1
ent_id, start, end = match[0]
@@ -316,23 +355,25 @@ def test_issue850_basic():
assert end == 4


-@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes",
-"terra-formées", "σ-compacts"])
+@pytest.mark.parametrize(
+"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
+)
def test_issue852(fr_tokenizer, text):
"""Test that French tokenizer exceptions are imported correctly."""
tokens = fr_tokenizer(text)
assert len(tokens) == 1


-@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
-"aaabbb@ccc.com \nThank you!"])
+@pytest.mark.parametrize(
+"text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
+)
def test_issue859(en_tokenizer, text):
"""Test that no extra space is added in doc.text method."""
doc = en_tokenizer(text)
assert doc.text == text


-@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
+@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
"""Test that token.idx matches the original text index for texts with newlines."""
doc = en_tokenizer(text)
@@ -341,7 +382,7 @@ def test_issue886(en_tokenizer, text):
assert text[token.idx] == token.text[0]


-@pytest.mark.parametrize('text', ["want/need"])
+@pytest.mark.parametrize("text", ["want/need"])
def test_issue891(en_tokenizer, text):
"""Test that / infixes are split correctly."""
tokens = en_tokenizer(text)
@@ -349,11 +390,10 @@ def test_issue891(en_tokenizer, text):
assert tokens[1].text == "/"


-@pytest.mark.parametrize('text,tag,lemma', [
-("anus", "NN", "anus"),
-("princess", "NN", "princess"),
-("inner", "JJ", "inner")
-])
+@pytest.mark.parametrize(
+"text,tag,lemma",
+[("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],
+)
def test_issue912(en_vocab, text, tag, lemma):
"""Test base-forms are preserved."""
doc = Doc(en_vocab, words=[text])
@@ -364,10 +404,10 @@ def test_issue912(en_vocab, text, tag, lemma):
def test_issue957(en_tokenizer):
"""Test that spaCy doesn't hang on many periods."""
# skip test if pytest-timeout is not installed
-timeout = pytest.importorskip('pytest-timeout')
+timeout = pytest.importorskip("pytest-timeout")
-string = '0'
+string = "0"
for i in range(1, 100):
-string += '.%d' % i
+string += ".%d" % i
doc = en_tokenizer(string)


@@ -386,13 +426,13 @@ def test_issue999(train_data):
["hello", []],
["hi", []],
["i'm looking for a place to eat", []],
-["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]],
+["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
-["show me chinese restaurants", [[8,15,"CUISINE"]]],
+["show me chinese restaurants", [[8, 15, "CUISINE"]]],
-["show me chines restaurants", [[8,14,"CUISINE"]]],
+["show me chines restaurants", [[8, 14, "CUISINE"]]],
]

nlp = Language()
-ner = nlp.create_pipe('ner')
+ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
@@ -402,7 +442,7 @@ def test_issue999(train_data):
for itn in range(100):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:
-nlp.update([raw_text], [{'entities': entity_offsets}])
+nlp.update([raw_text], [{"entities": entity_offsets}])

with make_tempdir() as model_dir:
nlp.to_disk(model_dir)

@@ -15,76 +15,84 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part

def test_issue1242():
nlp = English()
-doc = nlp('')
+doc = nlp("")
assert len(doc) == 0
-docs = list(nlp.pipe(['', 'hello']))
+docs = list(nlp.pipe(["", "hello"]))
assert len(docs[0]) == 0
assert len(docs[1]) == 1


def test_issue1250():
"""Test cached special cases."""
-special_case = [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}]
+special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
nlp = English()
-nlp.tokenizer.add_special_case('reimbur', special_case)
+nlp.tokenizer.add_special_case("reimbur", special_case)
-lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
+lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
-assert lemmas == ['reimburse', ',', 'reimburse', '...']
+assert lemmas == ["reimburse", ",", "reimburse", "..."]
-lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
+lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
-assert lemmas == ['reimburse', ',', 'reimburse', '...']
+assert lemmas == ["reimburse", ",", "reimburse", "..."]


def test_issue1257():
"""Test that tokens compare correctly."""
-doc1 = Doc(Vocab(), words=['a', 'b', 'c'])
+doc1 = Doc(Vocab(), words=["a", "b", "c"])
-doc2 = Doc(Vocab(), words=['a', 'c', 'e'])
+doc2 = Doc(Vocab(), words=["a", "c", "e"])
assert doc1[0] != doc2[0]
assert not doc1[0] == doc2[0]


def test_issue1375():
"""Test that token.nbor() raises IndexError for out-of-bounds access."""
-doc = Doc(Vocab(), words=['0', '1', '2'])
+doc = Doc(Vocab(), words=["0", "1", "2"])
with pytest.raises(IndexError):
assert doc[0].nbor(-1)
-assert doc[1].nbor(-1).text == '0'
+assert doc[1].nbor(-1).text == "0"
with pytest.raises(IndexError):
assert doc[2].nbor(1)
-assert doc[1].nbor(1).text == '2'
+assert doc[1].nbor(1).text == "2"


def test_issue1387():
-tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
+tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
-index = {"verb": ("cope","cop")}
+index = {"verb": ("cope", "cop")}
exc = {"verb": {"coping": ("cope",)}}
rules = {"verb": [["ing", ""]]}
lemmatizer = Lemmatizer(index, exc, rules)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=["coping"])
-doc[0].tag_ = 'VBG'
+doc[0].tag_ = "VBG"
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"


def test_issue1434():
"""Test matches occur when optional element at end of short doc."""
-pattern = [{'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'}]
+pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
vocab = Vocab(lex_attr_getters=LEX_ATTRS)
-hello_world = Doc(vocab, words=['Hello', 'World'])
+hello_world = Doc(vocab, words=["Hello", "World"])
-hello = Doc(vocab, words=['Hello'])
+hello = Doc(vocab, words=["Hello"])
matcher = Matcher(vocab)
-matcher.add('MyMatcher', None, pattern)
+matcher.add("MyMatcher", None, pattern)
matches = matcher(hello_world)
assert matches
matches = matcher(hello)
assert matches


-@pytest.mark.parametrize('string,start,end', [
-('a', 0, 1), ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2),
-('a b b c', 0, 3), ('a b b', 0, 3),])
+@pytest.mark.parametrize(
+"string,start,end",
+[
+("a", 0, 1),
+("a b", 0, 2),
+("a c", 0, 1),
+("a b c", 0, 2),
+("a b b c", 0, 3),
+("a b b", 0, 3),
+],
+)
def test_issue1450(string, start, end):
"""Test matcher works when patterns end with * operator."""
-pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
+pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
matcher = Matcher(Vocab())
matcher.add("TSTEND", None, pattern)
doc = Doc(Vocab(), words=string.split())
@@ -96,17 +104,20 @@ def test_issue1450(string, start, end):


def test_issue1488():
-prefix_re = re.compile(r'''[\[\("']''')
+prefix_re = re.compile(r"""[\[\("']""")
-suffix_re = re.compile(r'''[\]\)"']''')
+suffix_re = re.compile(r"""[\]\)"']""")
-infix_re = re.compile(r'''[-~\.]''')
+infix_re = re.compile(r"""[-~\.]""")
-simple_url_re = re.compile(r'''^https?://''')
+simple_url_re = re.compile(r"""^https?://""")

def my_tokenizer(nlp):
-return Tokenizer(nlp.vocab, {},
-prefix_search=prefix_re.search,
-suffix_search=suffix_re.search,
-infix_finditer=infix_re.finditer,
-token_match=simple_url_re.match)
+return Tokenizer(
+nlp.vocab,
+{},
+prefix_search=prefix_re.search,
+suffix_search=suffix_re.search,
+infix_finditer=infix_re.finditer,
+token_match=simple_url_re.match,
+)

nlp = English()
nlp.tokenizer = my_tokenizer(nlp)
@@ -116,11 +127,16 @@ def test_issue1488():


def test_issue1494():
-infix_re = re.compile(r'''[^a-z]''')
+infix_re = re.compile(r"""[^a-z]""")
-test_cases = [('token 123test', ['token', '1', '2', '3', 'test']),
-('token 1test', ['token', '1test']),
-('hello...test', ['hello', '.', '.', '.', 'test'])]
-new_tokenizer = lambda nlp: Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
+test_cases = [
+("token 123test", ["token", "1", "2", "3", "test"]),
+("token 1test", ["token", "1test"]),
+("hello...test", ["hello", ".", ".", ".", "test"]),
+]

+def new_tokenizer(nlp):
+return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

nlp = English()
nlp.tokenizer = new_tokenizer(nlp)
for text, expected in test_cases:

@@ -45,17 +45,17 @@ def test_issue1506():
def test_issue1518():
"""Test vectors.resize() works."""
vectors = Vectors(shape=(10, 10))
-vectors.add('hello', row=2)
+vectors.add("hello", row=2)
vectors.resize((5, 9))


def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
-string = 'The sky is blue . The man is pink . The dog is purple .'
+string = "The sky is blue . The man is pink . The dog is purple ."
doc = Doc(Vocab(), words=string.split())
doc[0].sent_start = True
for word in doc[1:]:
-if word.nbor(-1).text == '.':
+if word.nbor(-1).text == ".":
word.sent_start = True
else:
word.sent_start = False
@@ -67,7 +67,7 @@ def test_issue1537():


# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
-#def test_issue1537_model():
+# def test_issue1537_model():
# nlp = load_spacy('en')
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
# sents = [s.as_doc() for s in doc.sents]
@@ -77,41 +77,41 @@ def test_issue1537():

def test_issue1539():
"""Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
-v = Vectors(shape=(10, 10), keys=[5,3,98,100])
+v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
-v.resize((100,100))
+v.resize((100, 100))


def test_issue1547():
"""Test that entity labels still match after merging tokens."""
-words = ['\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n']
+words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
doc = Doc(Vocab(), words=words)
-doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings['PRODUCT'])]
+doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
doc[5:7].merge()
assert [ent.text for ent in doc.ents]


def test_issue1612(en_tokenizer):
-doc = en_tokenizer('The black cat purrs.')
+doc = en_tokenizer("The black cat purrs.")
-span = doc[1: 3]
+span = doc[1:3]
assert span.orth_ == span.text


def test_issue1654():
nlp = Language(Vocab())
assert not nlp.pipeline
-nlp.add_pipe(lambda doc: doc, name='1')
+nlp.add_pipe(lambda doc: doc, name="1")
-nlp.add_pipe(lambda doc: doc, name='2', after='1')
+nlp.add_pipe(lambda doc: doc, name="2", after="1")
-nlp.add_pipe(lambda doc: doc, name='3', after='2')
+nlp.add_pipe(lambda doc: doc, name="3", after="2")
-assert nlp.pipe_names == ['1', '2', '3']
+assert nlp.pipe_names == ["1", "2", "3"]
nlp2 = Language(Vocab())
assert not nlp2.pipeline
-nlp2.add_pipe(lambda doc: doc, name='3')
+nlp2.add_pipe(lambda doc: doc, name="3")
-nlp2.add_pipe(lambda doc: doc, name='2', before='3')
+nlp2.add_pipe(lambda doc: doc, name="2", before="3")
-nlp2.add_pipe(lambda doc: doc, name='1', before='2')
+nlp2.add_pipe(lambda doc: doc, name="1", before="2")
-assert nlp2.pipe_names == ['1', '2', '3']
+assert nlp2.pipe_names == ["1", "2", "3"]


-@pytest.mark.parametrize('text', ['test@example.com', 'john.doe@example.co.uk'])
+@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
def test_issue1698(en_tokenizer, text):
doc = en_tokenizer(text)
assert len(doc) == 1
@@ -121,30 +121,30 @@ def test_issue1698(en_tokenizer, text):
def test_issue1727():
"""Test that models with no pretrained vectors can be deserialized
correctly after vectors are added."""
-data = numpy.ones((3, 300), dtype='f')
+data = numpy.ones((3, 300), dtype="f")
-vectors = Vectors(data=data, keys=['I', 'am', 'Matt'])
+vectors = Vectors(data=data, keys=["I", "am", "Matt"])
tagger = Tagger(Vocab())
-tagger.add_label('PRP')
+tagger.add_label("PRP")
tagger.begin_training()
-assert tagger.cfg.get('pretrained_dims', 0) == 0
+assert tagger.cfg.get("pretrained_dims", 0) == 0
tagger.vocab.vectors = vectors
with make_tempdir() as path:
tagger.to_disk(path)
tagger = Tagger(Vocab()).from_disk(path)
-assert tagger.cfg.get('pretrained_dims', 0) == 0
+assert tagger.cfg.get("pretrained_dims", 0) == 0


def test_issue1757():
"""Test comparison against None doesn't cause segfault."""
-doc = Doc(Vocab(), words=['a', 'b', 'c'])
+doc = Doc(Vocab(), words=["a", "b", "c"])
assert not doc[0] < None
assert not doc[0] == None
assert doc[0] >= None
assert not doc[:2] < None
assert not doc[:2] == None
assert doc[:2] >= None
-assert not doc.vocab['a'] == None
+assert not doc.vocab["a"] == None
-assert not doc.vocab['a'] < None
+assert not doc.vocab["a"] < None


def test_issue1758(en_tokenizer):
@@ -158,11 +158,20 @@ def test_issue1758(en_tokenizer):
def test_issue1799():
"""Test sentence boundaries are deserialized correctly, even for
non-projective sentences."""
-heads_deps = numpy.asarray([[1, 397], [4, 436], [2, 426], [1, 402],
-[0, 8206900633647566924], [18446744073709551615, 440],
-[18446744073709551614, 442]], dtype='uint64')
-doc = Doc(Vocab(), words='Just what I was looking for .'.split())
-doc.vocab.strings.add('ROOT')
+heads_deps = numpy.asarray(
+[
+[1, 397],
+[4, 436],
+[2, 426],
+[1, 402],
+[0, 8206900633647566924],
+[18446744073709551615, 440],
+[18446744073709551614, 442],
+],
+dtype="uint64",
+)
+doc = Doc(Vocab(), words="Just what I was looking for .".split())
+doc.vocab.strings.add("ROOT")
doc = doc.from_array([HEAD, DEP], heads_deps)
assert len(list(doc.sents)) == 1

@@ -170,9 +179,9 @@ def test_issue1799():
def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab()
-assert 'hello' not in vocab
+assert "hello" not in vocab
-vocab.set_vector('hello', numpy.ones((50,), dtype='f'))
+vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
-assert 'hello' in vocab
+assert "hello" in vocab


def test_issue1834():
@@ -195,34 +204,34 @@ def test_issue1834():
def test_issue1868():
"""Test Vocab.__contains__ works with int keys."""
vocab = Vocab()
-lex = vocab['hello']
+lex = vocab["hello"]
assert lex.orth in vocab
assert lex.orth_ in vocab
-assert 'some string' not in vocab
+assert "some string" not in vocab
-int_id = vocab.strings.add('some string')
+int_id = vocab.strings.add("some string")
assert int_id not in vocab


def test_issue1883():
matcher = Matcher(Vocab())
-matcher.add('pat1', None, [{'orth': 'hello'}])
+matcher.add("pat1", None, [{"orth": "hello"}])
-doc = Doc(matcher.vocab, words=['hello'])
+doc = Doc(matcher.vocab, words=["hello"])
assert len(matcher(doc)) == 1
new_matcher = copy.deepcopy(matcher)
-new_doc = Doc(new_matcher.vocab, words=['hello'])
+new_doc = Doc(new_matcher.vocab, words=["hello"])
assert len(new_matcher(new_doc)) == 1


-@pytest.mark.parametrize('word', ['the'])
+@pytest.mark.parametrize("word", ["the"])
def test_issue1889(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)


def test_issue1915():
-cfg = {'hidden_depth': 2} # should error out
+cfg = {"hidden_depth": 2} # should error out
nlp = Language()
-nlp.add_pipe(nlp.create_pipe('ner'))
+nlp.add_pipe(nlp.create_pipe("ner"))
-nlp.get_pipe('ner').add_label('answer')
+nlp.get_pipe("ner").add_label("answer")
with pytest.raises(ValueError):
nlp.begin_training(**cfg)

@@ -230,17 +239,17 @@ def test_issue1915():
def test_issue1945():
"""Test regression in Matcher introduced in v2.0.6."""
matcher = Matcher(Vocab())
-matcher.add('MWE', None, [{'orth': 'a'}, {'orth': 'a'}])
+matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
-doc = Doc(matcher.vocab, words=['a', 'a', 'a'])
+doc = Doc(matcher.vocab, words=["a", "a", "a"])
matches = matcher(doc) # we should see two overlapping matches here
assert len(matches) == 2
assert matches[0][1:] == (0, 2)
assert matches[1][1:] == (1, 3)


-@pytest.mark.parametrize('label', ['U-JOB-NAME'])
+@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
ner = EntityRecognizer(Vocab())
-entry = ([0], ['word'], ['tag'], [0], ['dep'], [label])
+entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
gold_parses = [(None, [(entry, None)])]
ner.moves.get_actions(gold_parses=gold_parses)

@@ -14,15 +14,15 @@ from ..util import add_vecs_to_vocab
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
-ner = nlp.create_pipe('ner')
+ner = nlp.create_pipe("ner")
-ner.add_label('CITIZENSHIP')
+ner.add_label("CITIZENSHIP")
nlp.add_pipe(ner)
nlp.begin_training()
nlp2 = Italian()
-nlp2.add_pipe(nlp2.create_pipe('ner'))
+nlp2.add_pipe(nlp2.create_pipe("ner"))
nlp2.from_bytes(nlp.to_bytes())
-assert 'extra_labels' not in nlp2.get_pipe('ner').cfg
+assert "extra_labels" not in nlp2.get_pipe("ner").cfg
-assert nlp2.get_pipe('ner').labels == ['CITIZENSHIP']
+assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"]


def test_issue2219(en_vocab):
@@ -34,7 +34,7 @@ def test_issue2219(en_vocab):


def test_issue2361(de_tokenizer):
-chars = ('&lt;', '&gt;', '&amp;', '&quot;')
+chars = ("&lt;", "&gt;", "&amp;", "&quot;")
doc = de_tokenizer('< > & " ')
doc.is_parsed = True
doc.is_tagged = True
@@ -46,25 +46,32 @@ def test_issue2361(de_tokenizer):
def test_issue2385():
"""Test that IOB tags are correctly converted to BILUO tags."""
# fix bug in labels with a 'b' character
-tags1 = ('B-BRAWLER', 'I-BRAWLER', 'I-BRAWLER')
+tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
-assert iob_to_biluo(tags1) == ['B-BRAWLER', 'I-BRAWLER', 'L-BRAWLER']
+assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
# maintain support for iob1 format
-tags2 = ('I-ORG', 'I-ORG', 'B-ORG')
+tags2 = ("I-ORG", "I-ORG", "B-ORG")
-assert iob_to_biluo(tags2) == ['B-ORG', 'L-ORG', 'U-ORG']
+assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
# maintain support for iob2 format
-tags3 = ('B-PERSON', 'I-PERSON', 'B-PERSON')
+tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
-assert iob_to_biluo(tags3) ==['B-PERSON', 'L-PERSON', 'U-PERSON']
+assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]


-@pytest.mark.parametrize('tags', [
-('B-ORG', 'L-ORG'), ('B-PERSON', 'I-PERSON', 'L-PERSON'), ('U-BRAWLER', 'U-BRAWLER')])
+@pytest.mark.parametrize(
+"tags",
+[
+("B-ORG", "L-ORG"),
+("B-PERSON", "I-PERSON", "L-PERSON"),
+("U-BRAWLER", "U-BRAWLER"),
+],
+)
def test_issue2385_biluo(tags):
"""Test that BILUO-compatible tags aren't modified."""
assert iob_to_biluo(tags) == list(tags)


def test_issue2482():
-'''Test we can serialize and deserialize a blank NER or parser model.'''
+"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()
-nlp.add_pipe(nlp.create_pipe('ner'))
+nlp.add_pipe(nlp.create_pipe("ner"))
b = nlp.to_bytes()
nlp2 = Italian().from_bytes(b)

@@ -7,11 +7,11 @@ from spacy.language import Language
def test_issue2564():
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
nlp = Language()
-tagger = nlp.create_pipe('tagger')
+tagger = nlp.create_pipe("tagger")
tagger.begin_training() # initialise weights
nlp.add_pipe(tagger)
-doc = nlp('hello world')
+doc = nlp("hello world")
assert doc.is_tagged
-docs = nlp.pipe(['hello', 'world'])
+docs = nlp.pipe(["hello", "world"])
piped_doc = next(docs)
assert piped_doc.is_tagged

@@ -7,11 +7,11 @@ from spacy.tokens import Span

def test_issue2569(en_tokenizer):
doc = en_tokenizer("It is May 15, 1993.")
-doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings['DATE'])]
+doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
matcher = Matcher(doc.vocab)
-matcher.add("RULE", None, [{'ENT_TYPE':'DATE', 'OP':'+'}])
+matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
matched = [doc[start:end] for _, start, end in matcher(doc)]
matched = sorted(matched, key=len, reverse=True)
assert len(matched) == 10
assert len(matched[0]) == 4
-assert matched[0].text == 'May 15, 1993'
+assert matched[0].text == "May 15, 1993"

@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import pytest
 from spacy.lang.en import English
 from spacy.matcher import Matcher
 
@@ -10,6 +9,7 @@ def test_issue2671():
     """Ensure the correct entity ID is returned for matches with quantifiers.
     See also #2675
     """
+
     def get_rule_id(nlp, matcher, doc):
         matches = matcher(doc)
         for match_id, start, end in matches:
@@ -19,10 +19,12 @@ def test_issue2671():
 
     nlp = English()
     matcher = Matcher(nlp.vocab)
-    pattern_id = 'test_pattern'
-    pattern = [{'LOWER': 'high'},
-               {'IS_PUNCT': True, 'OP': '?'},
-               {'LOWER': 'adrenaline'}]
+    pattern_id = "test_pattern"
+    pattern = [
+        {"LOWER": "high"},
+        {"IS_PUNCT": True, "OP": "?"},
+        {"LOWER": "adrenaline"},
+    ]
     matcher.add(pattern_id, None, pattern)
     doc1 = nlp("This is a high-adrenaline situation.")
     doc2 = nlp("This is a high adrenaline situation.")
@@ -1,17 +1,15 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import pytest
-
 from ..util import get_doc
 
 
 def test_issue2772(en_vocab):
     """Test that deprojectivization doesn't mess up sentence boundaries."""
-    words = 'When we write or communicate virtually , we can hide our true feelings .'.split()
+    words = "When we write or communicate virtually , we can hide our true feelings .".split()
     # A tree with a non-projective (i.e. crossing) arc
     # The arcs (0, 4) and (2, 9) cross.
     heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
-    deps = ['dep'] * len(heads)
+    deps = ["dep"] * len(heads)
     doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
     assert doc[1].is_sent_start is None
@@ -5,8 +5,8 @@ from spacy.util import get_lang_class
 import pytest
 
 
-@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
-@pytest.mark.parametrize('lang', ['en', 'xx'])
+@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
+@pytest.mark.parametrize("lang", ["en", "xx"])
 def test_issue2782(text, lang):
     """Check that like_num handles + and - before number."""
     cls = get_lang_class(lang)
@@ -18,25 +18,25 @@ def test_serialize_empty_doc(en_vocab):
 
 
 def test_serialize_doc_roundtrip_bytes(en_vocab):
-    doc = Doc(en_vocab, words=['hello', 'world'])
+    doc = Doc(en_vocab, words=["hello", "world"])
     doc_b = doc.to_bytes()
     new_doc = Doc(en_vocab).from_bytes(doc_b)
     assert new_doc.to_bytes() == doc_b
 
 
 def test_serialize_doc_roundtrip_disk(en_vocab):
-    doc = Doc(en_vocab, words=['hello', 'world'])
+    doc = Doc(en_vocab, words=["hello", "world"])
     with make_tempdir() as d:
-        file_path = d / 'doc'
+        file_path = d / "doc"
         doc.to_disk(file_path)
         doc_d = Doc(en_vocab).from_disk(file_path)
         assert doc.to_bytes() == doc_d.to_bytes()
 
 
 def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
-    doc = Doc(en_vocab, words=['hello', 'world'])
+    doc = Doc(en_vocab, words=["hello", "world"])
     with make_tempdir() as d:
-        file_path = d / 'doc'
+        file_path = d / "doc"
         file_path = path2str(file_path)
         doc.to_disk(file_path)
         doc_d = Doc(en_vocab).from_disk(file_path)
@@ -8,19 +8,20 @@ from spacy.vocab import Vocab
 
 @pytest.fixture
 def doc_w_attrs(en_tokenizer):
-    Doc.set_extension('_test_attr', default=False)
-    Doc.set_extension('_test_prop', getter=lambda doc: len(doc.text))
-    Doc.set_extension('_test_method', method=lambda doc, arg: "{}{}".format(len(doc.text), arg))
+    Doc.set_extension("_test_attr", default=False)
+    Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text))
+    Doc.set_extension(
+        "_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg)
+    )
     doc = en_tokenizer("This is a test.")
-    doc._._test_attr = 'test'
+    doc._._test_attr = "test"
     return doc
 
 
-
 def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
     doc_b = doc_w_attrs.to_bytes()
     doc = Doc(Vocab()).from_bytes(doc_b)
-    assert doc._.has('_test_attr')
-    assert doc._._test_attr == 'test'
+    assert doc._.has("_test_attr")
+    assert doc._._test_attr == "test"
     assert doc._._test_prop == len(doc.text)
-    assert doc._._test_method('test') == '{}{}'.format(len(doc.text), 'test')
+    assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")
@@ -12,14 +12,14 @@ from ..util import make_tempdir
 @pytest.fixture
 def meta_data():
     return {
-        'name': 'name-in-fixture',
-        'version': 'version-in-fixture',
-        'description': 'description-in-fixture',
-        'author': 'author-in-fixture',
-        'email': 'email-in-fixture',
-        'url': 'url-in-fixture',
-        'license': 'license-in-fixture',
-        'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}
+        "name": "name-in-fixture",
+        "version": "version-in-fixture",
+        "description": "description-in-fixture",
+        "author": "author-in-fixture",
+        "email": "email-in-fixture",
+        "url": "url-in-fixture",
+        "license": "license-in-fixture",
+        "vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
     }
 
 
@@ -35,16 +35,18 @@ def test_serialize_with_custom_tokenizer():
     """Test that serialization with custom tokenizer works without token_match.
     See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
     """
-    prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
-    suffix_re = re.compile(r'''''')
-    infix_re = re.compile(r'''[~]''')
+    prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
+    suffix_re = re.compile(r"""""")
+    infix_re = re.compile(r"""[~]""")
 
     def custom_tokenizer(nlp):
-        return Tokenizer(nlp.vocab,
-                         {},
-                         prefix_search=prefix_re.search,
-                         suffix_search=suffix_re.search,
-                         infix_finditer=infix_re.finditer)
+        return Tokenizer(
+            nlp.vocab,
+            {},
+            prefix_search=prefix_re.search,
+            suffix_search=suffix_re.search,
+            infix_finditer=infix_re.finditer,
+        )
 
     nlp = Language()
     nlp.tokenizer = custom_tokenizer(nlp)
@@ -2,7 +2,8 @@
 from __future__ import unicode_literals
 
 import pytest
-from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer, Tensorizer, TextCategorizer
+from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
+from spacy.pipeline import Tensorizer, TextCategorizer
 
 from ..util import make_tempdir
 
@@ -13,7 +14,7 @@ test_parsers = [DependencyParser, EntityRecognizer]
 @pytest.fixture
 def parser(en_vocab):
     parser = DependencyParser(en_vocab)
-    parser.add_label('nsubj')
+    parser.add_label("nsubj")
     parser.model, cfg = parser.Model(parser.moves.n_moves)
     parser.cfg.update(cfg)
     return parser
@@ -34,7 +35,7 @@ def taggers(en_vocab):
     return (tagger1, tagger2)
 
 
-@pytest.mark.parametrize('Parser', test_parsers)
+@pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     parser = Parser(en_vocab)
     parser.model, _ = parser.Model(10)
@@ -44,12 +45,12 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
     assert new_parser.to_bytes() == parser.to_bytes()
 
 
-@pytest.mark.parametrize('Parser', test_parsers)
+@pytest.mark.parametrize("Parser", test_parsers)
 def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
     parser = Parser(en_vocab)
     parser.model, _ = parser.Model(0)
     with make_tempdir() as d:
-        file_path = d / 'parser'
+        file_path = d / "parser"
         parser.to_disk(file_path)
         parser_d = Parser(en_vocab)
         parser_d.model, _ = parser_d.Model(0)
@@ -67,7 +68,9 @@ def test_to_from_bytes(parser, blank_parser):
     assert blank_parser.moves.n_moves == parser.moves.n_moves
 
 
-@pytest.mark.skip(reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms.")
+@pytest.mark.skip(
+    reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms."
+)
 def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
@@ -81,8 +84,8 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
 def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
     tagger1, tagger2 = taggers
     with make_tempdir() as d:
-        file_path1 = d / 'tagger1'
-        file_path2 = d / 'tagger2'
+        file_path1 = d / "tagger1"
+        file_path2 = d / "tagger2"
         tagger1.to_disk(file_path1)
         tagger2.to_disk(file_path2)
         tagger1_d = Tagger(en_vocab).from_disk(file_path1)
@@ -102,7 +105,7 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
     tensorizer = Tensorizer(en_vocab)
     tensorizer.model = tensorizer.Model()
     with make_tempdir() as d:
-        file_path = d / 'tensorizer'
+        file_path = d / "tensorizer"
         tensorizer.to_disk(file_path)
         tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
         assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
@@ -110,5 +113,5 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
 
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
-    textcat = TextCategorizer(en_vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
+    textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
     textcat_bytes = textcat.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
def load_tokenizer(b):
|
def load_tokenizer(b):
|
||||||
tok = get_lang_class('en').Defaults.create_tokenizer()
|
tok = get_lang_class("en").Defaults.create_tokenizer()
|
||||||
tok.from_bytes(b)
|
tok.from_bytes(b)
|
||||||
return tok
|
return tok
|
||||||
|
|
||||||
|
@ -23,7 +23,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Currently unreliable across platforms")
|
@pytest.mark.skip(reason="Currently unreliable across platforms")
|
||||||
@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"])
|
@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"])
|
||||||
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
||||||
tokenizer = en_tokenizer
|
tokenizer = en_tokenizer
|
||||||
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
|
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
|
||||||
|
@ -38,7 +38,7 @@ def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
|
||||||
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
|
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
|
||||||
tokenizer = en_tokenizer
|
tokenizer = en_tokenizer
|
||||||
with make_tempdir() as d:
|
with make_tempdir() as d:
|
||||||
file_path = d / 'tokenizer'
|
file_path = d / "tokenizer"
|
||||||
tokenizer.to_disk(file_path)
|
tokenizer.to_disk(file_path)
|
||||||
tokenizer_d = en_tokenizer.from_disk(file_path)
|
tokenizer_d = en_tokenizer.from_disk(file_path)
|
||||||
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()
|
||||||
|
|
|
@@ -8,12 +8,12 @@ from spacy.strings import StringStore
 from ..util import make_tempdir
 
 
-test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
-test_strings_attrs = [(['rats', 'are', 'cute'], 'Hello')]
+test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
+test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
 
 
 @pytest.mark.xfail
-@pytest.mark.parametrize('text', ['rat'])
+@pytest.mark.parametrize("text", ["rat"])
 def test_serialize_vocab(en_vocab, text):
     text_hash = en_vocab.strings.add(text)
     vocab_bytes = en_vocab.to_bytes()
@@ -21,7 +21,7 @@ def test_serialize_vocab(en_vocab, text):
     assert new_vocab.strings(text_hash) == text
 
 
-@pytest.mark.parametrize('strings1,strings2', test_strings)
+@pytest.mark.parametrize("strings1,strings2", test_strings)
 def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     vocab1 = Vocab(strings=strings1)
     vocab2 = Vocab(strings=strings2)
@@ -39,13 +39,13 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
     assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
 
 
-@pytest.mark.parametrize('strings1,strings2', test_strings)
-def test_serialize_vocab_roundtrip_disk(strings1,strings2):
+@pytest.mark.parametrize("strings1,strings2", test_strings)
+def test_serialize_vocab_roundtrip_disk(strings1, strings2):
     vocab1 = Vocab(strings=strings1)
     vocab2 = Vocab(strings=strings2)
     with make_tempdir() as d:
-        file_path1 = d / 'vocab1'
-        file_path2 = d / 'vocab2'
+        file_path1 = d / "vocab1"
+        file_path2 = d / "vocab2"
         vocab1.to_disk(file_path1)
         vocab2.to_disk(file_path2)
         vocab1_d = Vocab().from_disk(file_path1)
@@ -58,7 +58,7 @@ def test_serialize_vocab_roundtrip_disk(strings1,strings2):
     assert list(vocab1_d) != list(vocab2_d)
 
 
-@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
+@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
@@ -69,7 +69,7 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
     assert vocab2[strings[0]].norm_ == lex_attr
 
 
-@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
+@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
 def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
     vocab1 = Vocab(strings=strings)
     vocab2 = Vocab()
@@ -77,13 +77,13 @@ def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
     assert vocab1[strings[0]].norm_ == lex_attr
     assert vocab2[strings[0]].norm_ != lex_attr
     with make_tempdir() as d:
-        file_path = d / 'vocab'
+        file_path = d / "vocab"
         vocab1.to_disk(file_path)
         vocab2 = vocab2.from_disk(file_path)
     assert vocab2[strings[0]].norm_ == lex_attr
 
 
-@pytest.mark.parametrize('strings1,strings2', test_strings)
+@pytest.mark.parametrize("strings1,strings2", test_strings)
 def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
     sstore1 = StringStore(strings=strings1)
     sstore2 = StringStore(strings=strings2)
@@ -100,13 +100,13 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
     assert list(new_sstore1) == strings1
 
 
-@pytest.mark.parametrize('strings1,strings2', test_strings)
+@pytest.mark.parametrize("strings1,strings2", test_strings)
 def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
     sstore1 = StringStore(strings=strings1)
     sstore2 = StringStore(strings=strings2)
     with make_tempdir() as d:
-        file_path1 = d / 'strings1'
-        file_path2 = d / 'strings2'
+        file_path1 = d / "strings1"
+        file_path2 = d / "strings2"
         sstore1.to_disk(file_path1)
         sstore2.to_disk(file_path2)
         sstore1_d = StringStore().from_disk(file_path1)
@@ -5,52 +5,63 @@ import pytest
 from spacy._align import align, multi_align
 
 
-@pytest.mark.parametrize('string1,string2,cost', [
-    ('hello', 'hell', 1),
-    ('rat', 'cat', 1),
-    ('rat', 'rat', 0),
-    ('rat', 'catsie', 4),
-    ('t', 'catsie', 5),
-])
+@pytest.mark.parametrize(
+    "string1,string2,cost",
+    [
+        ("hello", "hell", 1),
+        ("rat", "cat", 1),
+        ("rat", "rat", 0),
+        ("rat", "catsie", 4),
+        ("t", "catsie", 5),
+    ],
+)
 def test_align_costs(string1, string2, cost):
     output_cost, i2j, j2i, matrix = align(string1, string2)
     assert output_cost == cost
 
 
-@pytest.mark.parametrize('string1,string2,i2j', [
-    ('hello', 'hell', [0,1,2,3,-1]),
-    ('rat', 'cat', [0,1,2]),
-    ('rat', 'rat', [0,1,2]),
-    ('rat', 'catsie', [0,1,2]),
-    ('t', 'catsie', [2]),
-])
+@pytest.mark.parametrize(
+    "string1,string2,i2j",
+    [
+        ("hello", "hell", [0, 1, 2, 3, -1]),
+        ("rat", "cat", [0, 1, 2]),
+        ("rat", "rat", [0, 1, 2]),
+        ("rat", "catsie", [0, 1, 2]),
+        ("t", "catsie", [2]),
+    ],
+)
 def test_align_i2j(string1, string2, i2j):
     output_cost, output_i2j, j2i, matrix = align(string1, string2)
     assert list(output_i2j) == i2j
 
 
-@pytest.mark.parametrize('string1,string2,j2i', [
-    ('hello', 'hell', [0,1,2,3]),
-    ('rat', 'cat', [0,1,2]),
-    ('rat', 'rat', [0,1,2]),
-    ('rat', 'catsie', [0,1,2, -1, -1, -1]),
-    ('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
-])
+@pytest.mark.parametrize(
+    "string1,string2,j2i",
+    [
+        ("hello", "hell", [0, 1, 2, 3]),
+        ("rat", "cat", [0, 1, 2]),
+        ("rat", "rat", [0, 1, 2]),
+        ("rat", "catsie", [0, 1, 2, -1, -1, -1]),
+        ("t", "catsie", [-1, -1, 0, -1, -1, -1]),
+    ],
+)
 def test_align_i2j(string1, string2, j2i):
     output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
     assert list(output_j2i) == j2i
 
 
 def test_align_strings():
-    words1 = ['hello', 'this', 'is', 'test!']
-    words2 = ['hellothis', 'is', 'test', '!']
+    words1 = ["hello", "this", "is", "test!"]
+    words2 = ["hellothis", "is", "test", "!"]
     cost, i2j, j2i, matrix = align(words1, words2)
     assert cost == 4
     assert list(i2j) == [-1, -1, 1, -1]
     assert list(j2i) == [-1, 2, -1, -1]
 
 
 def test_align_many_to_one():
-    words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
-    words2 = ['ab', 'bc', 'e', 'fg', 'h']
+    words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
+    words2 = ["ab", "bc", "e", "fg", "h"]
     cost, i2j, j2i, matrix = align(words1, words2)
     assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
     lengths1 = [len(w) for w in words1]
@@ -8,75 +8,78 @@ from .util import get_doc
 
 
 def test_gold_biluo_U(en_vocab):
-    orths_and_spaces = [('I', True), ('flew', True), ('to', True),
-                        ('London', False), ('.', True)]
-    doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
-    entities = [(len("I flew to "), len("I flew to London"), 'LOC')]
+    words = ["I", "flew", "to", "London", "."]
+    spaces = [True, True, True, False, True]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    entities = [(len("I flew to "), len("I flew to London"), "LOC")]
     tags = biluo_tags_from_offsets(doc, entities)
-    assert tags == ['O', 'O', 'O', 'U-LOC', 'O']
+    assert tags == ["O", "O", "O", "U-LOC", "O"]
 
 
 def test_gold_biluo_BL(en_vocab):
-    orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
-                        ('Francisco', False), ('.', True)]
-    doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco"), 'LOC')]
+    words = ["I", "flew", "to", "San", "Francisco", "."]
+    spaces = [True, True, True, True, False, True]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
     tags = biluo_tags_from_offsets(doc, entities)
-    assert tags == ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O']
+    assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
 
 
 def test_gold_biluo_BIL(en_vocab):
-    orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
-                        ('Francisco', True), ('Valley', False), ('.', True)]
-    doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
+    words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
+    spaces = [True, True, True, True, True, False, True]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     tags = biluo_tags_from_offsets(doc, entities)
-    assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
+    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 
 
 def test_gold_biluo_misalign(en_vocab):
-    orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
-                        ('Francisco', True), ('Valley.', False)]
-    doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
+    words = ["I", "flew", "to", "San", "Francisco", "Valley."]
+    spaces = [True, True, True, True, True, False]
+    doc = Doc(en_vocab, words=words, spaces=spaces)
+    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
     tags = biluo_tags_from_offsets(doc, entities)
-    assert tags == ['O', 'O', 'O', '-', '-', '-']
+    assert tags == ["O", "O", "O", "-", "-", "-"]
 
 
 def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
     text = "I flew to Silicon Valley via London."
-    biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
-    offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
+    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
+    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
     doc = en_tokenizer(text)
     biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
     assert biluo_tags_converted == biluo_tags
     offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
     assert offsets_converted == offsets
 
 
 def test_docs_to_json(en_vocab):
-    '''Test we can convert a list of Doc objects into the JSON-serializable
+    """Test we can convert a list of Doc objects into the JSON-serializable
     format we use for training.
-    '''
+    """
     docs = [
         get_doc(
             en_vocab,
-            words=['a', 'b'],
-            pos=['VBP', 'NN'],
+            words=["a", "b"],
+            pos=["VBP", "NN"],
             heads=[0, -1],
-            deps=['ROOT', 'dobj'],
-            ents=[]),
+            deps=["ROOT", "dobj"],
+            ents=[],
+        ),
         get_doc(
             en_vocab,
-            words=['c', 'd', 'e'],
-            pos=['VBP', 'NN', 'NN'],
+            words=["c", "d", "e"],
+            pos=["VBP", "NN", "NN"],
             heads=[0, -1, -2],
-            deps=['ROOT', 'dobj', 'dobj'],
-            ents=[(1, 2, 'ORG')]),
+            deps=["ROOT", "dobj", "dobj"],
+            ents=[(1, 2, "ORG")],
+        ),
     ]
     json_doc = docs_to_json(0, docs)
-    assert json_doc['id'] == 0
-    assert len(json_doc['paragraphs']) == 2
-    assert len(json_doc['paragraphs'][0]['sentences']) == 1
-    assert len(json_doc['paragraphs'][1]['sentences']) == 1
-    assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2
-    assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3
+    assert json_doc["id"] == 0
+    assert len(json_doc["paragraphs"]) == 2
+    assert len(json_doc["paragraphs"][0]["sentences"]) == 1
+    assert len(json_doc["paragraphs"][1]["sentences"]) == 1
+    assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
+    assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3
@@ -11,19 +11,19 @@ from spacy._ml import PrecomputableAffine
 from .util import get_doc
 
 
-@pytest.mark.parametrize('text', ['hello/world', 'hello world'])
+@pytest.mark.parametrize("text", ["hello/world", "hello world"])
 def test_util_ensure_path_succeeds(text):
     path = util.ensure_path(text)
     assert isinstance(path, Path)
 
 
-@pytest.mark.parametrize('package', ['numpy'])
+@pytest.mark.parametrize("package", ["numpy"])
 def test_util_is_package(package):
     """Test that an installed package via pip is recognised by util.is_package."""
     assert util.is_package(package)
 
 
-@pytest.mark.parametrize('package', ['thinc'])
+@pytest.mark.parametrize("package", ["thinc"])
 def test_util_get_package_path(package):
     """Test that a Path object is returned for a package name."""
     path = util.get_package_path(package)
@@ -33,44 +33,47 @@ def test_util_get_package_path(package):
 def test_displacy_parse_ents(en_vocab):
     """Test that named entities on a Doc are converted into displaCy's format."""
     doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])]
+    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
     ents = displacy.parse_ents(doc)
     assert isinstance(ents, dict)
-    assert ents['text'] == 'But Google is starting from behind '
-    assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}]
+    assert ents["text"] == "But Google is starting from behind "
+    assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}]
 
 
 def test_displacy_parse_deps(en_vocab):
     """Test that deps and tags on a Doc are converted into displaCy's format."""
     words = ["This", "is", "a", "sentence"]
     heads = [1, 0, 1, -2]
-    pos = ['DET', 'VERB', 'DET', 'NOUN']
-    tags = ['DT', 'VBZ', 'DT', 'NN']
-    deps = ['nsubj', 'ROOT', 'det', 'attr']
-    doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags,
-                  deps=deps)
+    pos = ["DET", "VERB", "DET", "NOUN"]
+    tags = ["DT", "VBZ", "DT", "NN"]
+    deps = ["nsubj", "ROOT", "det", "attr"]
+    doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
     deps = displacy.parse_deps(doc)
     assert isinstance(deps, dict)
-    assert deps['words'] == [{'text': 'This', 'tag': 'DET'},
-                             {'text': 'is', 'tag': 'VERB'},
-                             {'text': 'a', 'tag': 'DET'},
-                             {'text': 'sentence', 'tag': 'NOUN'}]
-    assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
-                            {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
-                            {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
+    assert deps["words"] == [
+        {"text": "This", "tag": "DET"},
+        {"text": "is", "tag": "VERB"},
+        {"text": "a", "tag": "DET"},
+        {"text": "sentence", "tag": "NOUN"},
+    ]
+    assert deps["arcs"] == [
+        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+        {"start": 2, "end": 3, "label": "det", "dir": "left"},
+        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+    ]
 
 
 def test_displacy_spans(en_vocab):
     """Test that displaCy can render Spans."""
     doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
-    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])]
-    html = displacy.render(doc[1:4], style='ent')
-    assert html.startswith('<div')
+    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
+    html = displacy.render(doc[1:4], style="ent")
+    assert html.startswith("<div")
 
 
 def test_displacy_raises_for_wrong_type(en_vocab):
     with pytest.raises(ValueError):
-        html = displacy.render('hello world')
+        html = displacy.render("hello world")
 
 
 def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
@@ -78,22 +81,22 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
     assert model.W.shape == (nF, nO, nP, nI)
     tensor = model.ops.allocate((10, nI))
     Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
+    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
     assert model.d_pad.shape == (1, nF, nO, nP)
     dY = model.ops.allocate((15, nO, nP))
     ids = model.ops.allocate((15, nF))
-    ids[1,2] = -1
+    ids[1, 2] = -1
     dY[1] = 1
-    assert model.d_pad[0, 2, 0, 0] == 0.
+    assert model.d_pad[0, 2, 0, 0] == 0.0
     model._backprop_padding(dY, ids)
-    assert model.d_pad[0, 2, 0, 0] == 1.
-    model.d_pad.fill(0.)
-    ids.fill(0.)
-    dY.fill(0.)
-    ids[1,2] = -1
-    ids[1,1] = -1
-    ids[1,0] = -1
+    assert model.d_pad[0, 2, 0, 0] == 1.0
+    model.d_pad.fill(0.0)
+    ids.fill(0.0)
+    dY.fill(0.0)
+    ids[1, 2] = -1
+    ids[1, 1] = -1
+    ids[1, 0] = -1
     dY[1] = 1
-    assert model.d_pad[0, 2, 0, 0] == 0.
+    assert model.d_pad[0, 2, 0, 0] == 0.0
     model._backprop_padding(dY, ids)
-    assert model.d_pad[0, 2, 0, 0] == 3.
+    assert model.d_pad[0, 2, 0, 0] == 3.0
@@ -9,7 +9,7 @@ from spacy.vocab import Vocab
 from spacy.attrs import NORM
 
 
-@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
+@pytest.mark.parametrize("text1,text2", [("hello", "bye")])
 def test_pickle_string_store(text1, text2):
     stringstore = StringStore()
     store1 = stringstore[text1]
@@ -21,10 +21,10 @@ def test_pickle_string_store(text1, text2):
     assert len(stringstore) == len(unpickled)
 
 
-@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
+@pytest.mark.parametrize("text1,text2", [("dog", "cat")])
 def test_pickle_vocab(text1, text2):
     vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
-    vocab.set_vector('dog', numpy.ones((5,), dtype='f'))
+    vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
     lex1 = vocab[text1]
     lex2 = vocab[text2]
     assert lex1.norm_ == text1[:-1]
@@ -37,4 +37,4 @@ def test_pickle_vocab(text1, text2):
     assert unpickled[text2].norm == lex2.norm
     assert unpickled[text1].norm != unpickled[text2].norm
     assert unpickled.vectors is not None
-    assert list(vocab['dog'].vector) == [1.,1.,1.,1.,1.]
+    assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
@@ -29,17 +29,19 @@ def test_tokenizer_handles_emoticons(tokenizer):
     assert tokens[17].text == ":D"
     assert tokens[18].text == "=|"
     assert tokens[19].text == '")'
-    assert tokens[20].text == ':>'
-    assert tokens[21].text == '....'
+    assert tokens[20].text == ":>"
+    assert tokens[21].text == "...."
 
 
-@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
+@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)])
 def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
     tokens = tokenizer(text)
     assert len(tokens) == length
 
-@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
-                                         ('i💙you', 3), ('🤘🤘yay!', 4)])
+
+@pytest.mark.parametrize(
+    "text,length", [("can you still dunk?🍕🍔😵LOL", 8), ("i💙you", 3), ("🤘🤘yay!", 4)]
+)
 def test_tokenizer_handles_emoji(tokenizer, text, length):
     # These break on narrow unicode builds, e.g. Windows
     if sys.maxunicode >= 1114111:
@ -12,11 +12,9 @@ NAUGHTY_STRINGS = [
|
||||||
",./;'[]\-=",
|
",./;'[]\-=",
|
||||||
'<>?:"{}|_+',
|
'<>?:"{}|_+',
|
||||||
'!@#$%^&*()`~"',
|
'!@#$%^&*()`~"',
|
||||||
|
|
||||||
# Unicode additional control characters, byte order marks
|
# Unicode additional control characters, byte order marks
|
||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
|
|
||||||
# Unicode Symbols
|
# Unicode Symbols
|
||||||
"Ω≈ç√∫˜µ≤≥÷",
|
"Ω≈ç√∫˜µ≤≥÷",
|
||||||
"åß∂ƒ©˙∆˚¬…æ",
|
"åß∂ƒ©˙∆˚¬…æ",
|
||||||
|
@ -29,13 +27,11 @@ NAUGHTY_STRINGS = [
|
||||||
"⅛⅜⅝⅞",
|
"⅛⅜⅝⅞",
|
||||||
"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
|
"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
|
||||||
"٠١٢٣٤٥٦٧٨٩",
|
"٠١٢٣٤٥٦٧٨٩",
|
||||||
|
|
||||||
# Unicode Subscript/Superscript/Accents
|
# Unicode Subscript/Superscript/Accents
|
||||||
"⁰⁴⁵",
|
"⁰⁴⁵",
|
||||||
"₀₁₂",
|
"₀₁₂",
|
||||||
"⁰⁴⁵₀₁₂",
|
"⁰⁴⁵₀₁₂",
|
||||||
"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
|
"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
|
||||||
|
|
||||||
# Two-Byte Characters
|
# Two-Byte Characters
|
||||||
"田中さんにあげて下さい",
|
"田中さんにあげて下さい",
|
||||||
"パーティーへ行かないか",
|
"パーティーへ行かないか",
|
||||||
|
@ -46,7 +42,6 @@ NAUGHTY_STRINGS = [
|
||||||
"社會科學院語學研究所",
|
"社會科學院語學研究所",
|
||||||
"울란바토르",
|
"울란바토르",
|
||||||
"𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
|
"𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
|
||||||
|
|
||||||
# Japanese Emoticons
|
# Japanese Emoticons
|
||||||
"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
|
"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
|
||||||
"(。◕ ∀ ◕。)",
|
"(。◕ ∀ ◕。)",
|
||||||
|
@ -55,11 +50,9 @@ NAUGHTY_STRINGS = [
|
||||||
"・( ̄∀ ̄)・:*:",
|
"・( ̄∀ ̄)・:*:",
|
||||||
"゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
|
"゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
|
||||||
",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
|
",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
|
||||||
"(╯°□°)╯︵ ┻━┻)"
|
"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻",
|
||||||
"(ノಥ益ಥ)ノ ┻━┻",
|
|
||||||
"┬─┬ノ( º _ ºノ)",
|
"┬─┬ノ( º _ ºノ)",
|
||||||
"( ͡° ͜ʖ ͡°)",
|
"( ͡° ͜ʖ ͡°)",
|
||||||
|
|
||||||
# Emoji
|
# Emoji
|
||||||
"😍",
|
"😍",
|
||||||
"👩🏽",
|
"👩🏽",
|
||||||
|
@ -69,18 +62,14 @@ NAUGHTY_STRINGS = [
|
||||||
"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
|
"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
|
||||||
"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
|
"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
|
||||||
"0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟",
|
"0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟",
|
||||||
|
|
||||||
# Regional Indicator Symbols
|
# Regional Indicator Symbols
|
||||||
"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
|
"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
|
||||||
"🇺🇸🇷🇺🇸🇦🇫🇦🇲",
|
"🇺🇸🇷🇺🇸🇦🇫🇦🇲",
|
||||||
"🇺🇸🇷🇺🇸🇦",
|
"🇺🇸🇷🇺🇸🇦",
|
||||||
|
|
||||||
# Unicode Numbers
|
# Unicode Numbers
|
||||||
"123",
|
"123",
|
||||||
"١٢٣",
|
"١٢٣",
|
||||||
|
|
||||||
# Right-To-Left Strings
|
# Right-To-Left Strings
|
||||||
|
|
||||||
"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
|
"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
|
||||||
"إيو.",
|
"إيو.",
|
||||||
"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
|
"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
|
||||||
|
@ -88,34 +77,21 @@ NAUGHTY_STRINGS = [
|
||||||
"﷽",
|
"﷽",
|
||||||
"ﷺ",
|
"ﷺ",
|
||||||
"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
|
"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
|
||||||
|
|
||||||
# Trick Unicode
|
# Trick Unicode
|
||||||
"test",
|
"test",
|
||||||
"test",
|
"test",
|
||||||
"
test
",
|
"
test
",
|
||||||
"testtest",
|
"testtest",
|
||||||
"test",
|
"test",
|
||||||
|
|
||||||
# Zalgo Text
|
# Zalgo Text
|
||||||
"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
|
"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
|
||||||
|
|
||||||
|
|
||||||
"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
|
"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
|
||||||
|
|
||||||
|
|
||||||
"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
|
"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
|
||||||
|
|
||||||
|
|
||||||
"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
|
"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
|
||||||
|
|
||||||
|
|
||||||
"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
|
"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
|
||||||
|
|
||||||
|
|
||||||
# Unicode Upsidedown
|
# Unicode Upsidedown
|
||||||
"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
|
"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
|
||||||
"00˙Ɩ$-",
|
"00˙Ɩ$-",
|
||||||
|
|
||||||
# Unicode font
|
# Unicode font
|
||||||
"The quick brown fox jumps over the lazy dog",
|
"The quick brown fox jumps over the lazy dog",
|
||||||
"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
|
"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
|
||||||
|
@ -125,19 +101,17 @@ NAUGHTY_STRINGS = [
|
||||||
"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
|
"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
|
||||||
"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
|
"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
|
||||||
"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
|
"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
|
||||||
|
|
||||||
# File paths
|
# File paths
|
||||||
"../../../../../../../../../../../etc/passwd%00",
|
"../../../../../../../../../../../etc/passwd%00",
|
||||||
"../../../../../../../../../../../etc/hosts",
|
"../../../../../../../../../../../etc/hosts",
|
||||||
|
|
||||||
# iOS Vulnerabilities
|
# iOS Vulnerabilities
|
||||||
"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
|
"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
|
||||||
"🏳0🌈️"
|
"🏳0🌈️",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.slow
|
@pytest.mark.slow
|
||||||
@pytest.mark.parametrize('text', NAUGHTY_STRINGS)
|
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
|
||||||
def test_tokenizer_naughty_strings(tokenizer, text):
|
def test_tokenizer_naughty_strings(tokenizer, text):
|
||||||
tokens = tokenizer(text)
|
tokens = tokenizer(text)
|
||||||
assert tokens.text_with_ws == text
|
assert tokens.text_with_ws == text
|
||||||
|
|
Some files were not shown because too many files have changed in this diff