💫 Tidy up and auto-format tests (#2967)

* Auto-format tests with black

* Add flake8 config

* Tidy up and remove unused imports

* Fix redefinitions of test functions

* Replace orths_and_spaces with words and spaces

* Fix compatibility with pytest 4.0

* xfail test for now

The test was previously overwritten by the following test due to a naming conflict, so its failure was never reported (see the first sketch below)

* Unfail passing test

* Only use fixtures via arguments

Fixes pytest 4.0 compatibility: calling fixture functions directly is no longer allowed (see the second sketch below)
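
For illustration, a minimal sketch of the naming-conflict problem behind the xfail (the function names here are hypothetical): when two test functions in the same module share a name, the second definition silently rebinds the first, so pytest only collects the later one and any failure in the shadowed test goes unreported. Renaming the duplicate (as done for the Arabic tokenizer tests in this commit) brings the hidden test back, and xfail keeps it collected without failing the suite until it can be fixed.

    import pytest


    def test_tokenizer_handles_exc():
        # Shadowed: the identical name below rebinds this module attribute,
        # so pytest never collects or runs this function.
        assert 1 + 1 == 3  # would fail, but is never reported


    def test_tokenizer_handles_exc():  # noqa: F811 (redefinition)
        # Only this definition is collected and run.
        assert 1 + 1 == 2


    # After renaming the duplicate (e.g. adding a _2 suffix), the first test
    # runs again; if it is known to fail, mark it so the failure is tracked
    # without breaking the build:
    @pytest.mark.xfail
    def test_tokenizer_handles_exc_known_broken():
        assert 1 + 1 == 3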
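
A second sketch, for the pytest 4.0 fixture change, assuming the session-scoped en_tokenizer fixture shown further down in this diff: pytest 4.0 turns direct calls to fixture functions into errors, so fixtures have to be requested as test arguments and injected by pytest.

    import pytest
    from spacy.util import get_lang_class


    @pytest.fixture(scope="session")
    def en_tokenizer():
        return get_lang_class("en").Defaults.create_tokenizer()


    # Before: the fixture was sometimes used like a plain helper function.
    # pytest 4.0 rejects this ("Fixtures are not meant to be called directly").
    #
    #     def test_tokenizer_handles_text():
    #         tokens = en_tokenizer("Give it back!")  # error under pytest 4.0
    #
    # After: the fixture is requested by parameter name and pytest injects it.
    def test_tokenizer_handles_text(en_tokenizer):
        tokens = en_tokenizer("Give it back!")
        assert [t.text for t in tokens] == ["Give", "it", "back", "!"]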
Ines Montani 2018-11-27 01:09:36 +01:00 committed by Matthew Honnibal
parent 2c37e0ccf6
commit b6e991440c
109 changed files with 2267 additions and 1706 deletions

.flake8 (new file)
View File

@ -0,0 +1,4 @@
[flake8]
ignore = E203, E266, E501, W503
max-line-length = 80
select = B,C,E,F,W,T4,B9

View File

@ -11,7 +11,7 @@ ujson>=1.35
dill>=0.2,<0.3
regex==2018.01.10
requests>=2.13.0,<3.0.0
pytest>=3.6.0,<4.0.0
pytest>=4.0.0,<5.0.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
pathlib==1.0.1; python_version < "3.4"

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals
import pytest
from io import StringIO, BytesIO
from spacy.util import get_lang_class
@ -11,126 +10,135 @@ def pytest_addoption(parser):
def pytest_runtest_setup(item):
for opt in ['slow']:
for opt in ["slow"]:
if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt)
@pytest.fixture(scope='module')
@pytest.fixture(scope="module")
def tokenizer():
return get_lang_class('xx').Defaults.create_tokenizer()
return get_lang_class("xx").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def en_tokenizer():
return get_lang_class('en').Defaults.create_tokenizer()
return get_lang_class("en").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def en_vocab():
return get_lang_class('en').Defaults.create_vocab()
return get_lang_class("en").Defaults.create_vocab()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def en_parser(en_vocab):
nlp = get_lang_class('en')(en_vocab)
return nlp.create_pipe('parser')
nlp = get_lang_class("en")(en_vocab)
return nlp.create_pipe("parser")
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def es_tokenizer():
return get_lang_class('es').Defaults.create_tokenizer()
return get_lang_class("es").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def de_tokenizer():
return get_lang_class('de').Defaults.create_tokenizer()
return get_lang_class("de").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def fr_tokenizer():
return get_lang_class('fr').Defaults.create_tokenizer()
return get_lang_class("fr").Defaults.create_tokenizer()
@pytest.fixture
def hu_tokenizer():
return get_lang_class('hu').Defaults.create_tokenizer()
return get_lang_class("hu").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def fi_tokenizer():
return get_lang_class('fi').Defaults.create_tokenizer()
return get_lang_class("fi").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ro_tokenizer():
return get_lang_class('ro').Defaults.create_tokenizer()
return get_lang_class("ro").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def id_tokenizer():
return get_lang_class('id').Defaults.create_tokenizer()
return get_lang_class("id").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def sv_tokenizer():
return get_lang_class('sv').Defaults.create_tokenizer()
return get_lang_class("sv").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def bn_tokenizer():
return get_lang_class('bn').Defaults.create_tokenizer()
return get_lang_class("bn").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ga_tokenizer():
return get_lang_class('ga').Defaults.create_tokenizer()
return get_lang_class("ga").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def he_tokenizer():
return get_lang_class('he').Defaults.create_tokenizer()
return get_lang_class("he").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def nb_tokenizer():
return get_lang_class('nb').Defaults.create_tokenizer()
return get_lang_class("nb").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def da_tokenizer():
return get_lang_class('da').Defaults.create_tokenizer()
return get_lang_class("da").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ja_tokenizer():
mecab = pytest.importorskip("MeCab")
return get_lang_class('ja').Defaults.create_tokenizer()
return get_lang_class("ja").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")
return get_lang_class('th').Defaults.create_tokenizer()
return get_lang_class("th").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def tr_tokenizer():
return get_lang_class('tr').Defaults.create_tokenizer()
return get_lang_class("tr").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def tt_tokenizer():
return get_lang_class('tt').Defaults.create_tokenizer()
return get_lang_class("tt").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def el_tokenizer():
return get_lang_class('el').Defaults.create_tokenizer()
return get_lang_class("el").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ar_tokenizer():
return get_lang_class('ar').Defaults.create_tokenizer()
return get_lang_class("ar").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ur_tokenizer():
return get_lang_class('ur').Defaults.create_tokenizer()
return get_lang_class("ur").Defaults.create_tokenizer()
@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ru_tokenizer():
pymorphy = pytest.importorskip('pymorphy2')
return get_lang_class('ru').Defaults.create_tokenizer()
pymorphy = pytest.importorskip("pymorphy2")
return get_lang_class("ru").Defaults.create_tokenizer()

View File

@ -38,7 +38,7 @@ def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
def test_doc_array_tag(en_tokenizer):
text = "A nice sentence."
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
pos = ["DET", "ADJ", "NOUN", "PUNCT"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos)
assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos
@ -51,7 +51,7 @@ def test_doc_array_tag(en_tokenizer):
def test_doc_array_dep(en_tokenizer):
text = "A nice sentence."
deps = ['det', 'amod', 'ROOT', 'punct']
deps = ["det", "amod", "ROOT", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
feats_array = doc.to_array((ORTH, DEP))

View File

@ -9,7 +9,7 @@ from spacy.lemmatizer import Lemmatizer
@pytest.fixture
def lemmatizer():
return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"})
@pytest.fixture
@ -23,15 +23,15 @@ def test_empty_doc(vocab):
def test_single_word(vocab):
doc = Doc(vocab, words=['a'])
assert doc.text == 'a '
doc = Doc(vocab, words=['a'], spaces=[False])
assert doc.text == 'a'
doc = Doc(vocab, words=["a"])
assert doc.text == "a "
doc = Doc(vocab, words=["a"], spaces=[False])
assert doc.text == "a"
def test_lookup_lemmatization(vocab):
doc = Doc(vocab, words=['dogs', 'dogses'])
assert doc[0].text == 'dogs'
assert doc[0].lemma_ == 'dog'
assert doc[1].text == 'dogses'
assert doc[1].lemma_ == 'dogses'
doc = Doc(vocab, words=["dogs", "dogses"])
assert doc[0].text == "dogs"
assert doc[0].lemma_ == "dog"
assert doc[1].text == "dogses"
assert doc[1].lemma_ == "dogses"

View File

@ -10,7 +10,7 @@ from spacy.attrs import LEMMA
from ..util import get_doc
@pytest.mark.parametrize('text', [["one", "two", "three"]])
@pytest.mark.parametrize("text", [["one", "two", "three"]])
def test_doc_api_compare_by_string_position(en_vocab, text):
doc = Doc(en_vocab, words=text)
# Get the tokens in this order, so their ID ordering doesn't match the idx
@ -28,80 +28,81 @@ def test_doc_api_compare_by_string_position(en_vocab, text):
def test_doc_api_getitem(en_tokenizer):
text = "Give it back! He pleaded."
tokens = en_tokenizer(text)
assert tokens[0].text == 'Give'
assert tokens[-1].text == '.'
assert tokens[0].text == "Give"
assert tokens[-1].text == "."
with pytest.raises(IndexError):
tokens[len(tokens)]
def to_str(span):
return '/'.join(token.text for token in span)
return "/".join(token.text for token in span)
span = tokens[1:1]
assert not to_str(span)
span = tokens[1:4]
assert to_str(span) == 'it/back/!'
assert to_str(span) == "it/back/!"
span = tokens[1:4:1]
assert to_str(span) == 'it/back/!'
assert to_str(span) == "it/back/!"
with pytest.raises(ValueError):
tokens[1:4:2]
with pytest.raises(ValueError):
tokens[1:4:-1]
span = tokens[-3:6]
assert to_str(span) == 'He/pleaded'
assert to_str(span) == "He/pleaded"
span = tokens[4:-1]
assert to_str(span) == 'He/pleaded'
assert to_str(span) == "He/pleaded"
span = tokens[-5:-3]
assert to_str(span) == 'back/!'
assert to_str(span) == "back/!"
span = tokens[5:4]
assert span.start == span.end == 5 and not to_str(span)
span = tokens[4:-3]
assert span.start == span.end == 4 and not to_str(span)
span = tokens[:]
assert to_str(span) == 'Give/it/back/!/He/pleaded/.'
assert to_str(span) == "Give/it/back/!/He/pleaded/."
span = tokens[4:]
assert to_str(span) == 'He/pleaded/.'
assert to_str(span) == "He/pleaded/."
span = tokens[:4]
assert to_str(span) == 'Give/it/back/!'
assert to_str(span) == "Give/it/back/!"
span = tokens[:-3]
assert to_str(span) == 'Give/it/back/!'
assert to_str(span) == "Give/it/back/!"
span = tokens[-3:]
assert to_str(span) == 'He/pleaded/.'
assert to_str(span) == "He/pleaded/."
span = tokens[4:50]
assert to_str(span) == 'He/pleaded/.'
assert to_str(span) == "He/pleaded/."
span = tokens[-50:4]
assert to_str(span) == 'Give/it/back/!'
assert to_str(span) == "Give/it/back/!"
span = tokens[-50:-40]
assert span.start == span.end == 0 and not to_str(span)
span = tokens[40:50]
assert span.start == span.end == 7 and not to_str(span)
span = tokens[1:4]
assert span[0].orth_ == 'it'
assert span[0].orth_ == "it"
subspan = span[:]
assert to_str(subspan) == 'it/back/!'
assert to_str(subspan) == "it/back/!"
subspan = span[:2]
assert to_str(subspan) == 'it/back'
assert to_str(subspan) == "it/back"
subspan = span[1:]
assert to_str(subspan) == 'back/!'
assert to_str(subspan) == "back/!"
subspan = span[:-1]
assert to_str(subspan) == 'it/back'
assert to_str(subspan) == "it/back"
subspan = span[-2:]
assert to_str(subspan) == 'back/!'
assert to_str(subspan) == "back/!"
subspan = span[1:2]
assert to_str(subspan) == 'back'
assert to_str(subspan) == "back"
subspan = span[-2:-1]
assert to_str(subspan) == 'back'
assert to_str(subspan) == "back"
subspan = span[-50:50]
assert to_str(subspan) == 'it/back/!'
assert to_str(subspan) == "it/back/!"
subspan = span[50:-50]
assert subspan.start == subspan.end == 4 and not to_str(subspan)
@pytest.mark.parametrize('text', ["Give it back! He pleaded.",
" Give it back! He pleaded. "])
@pytest.mark.parametrize(
"text", ["Give it back! He pleaded.", " Give it back! He pleaded. "]
)
def test_doc_api_serialize(en_tokenizer, text):
tokens = en_tokenizer(text)
new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes())
@ -110,13 +111,15 @@ def test_doc_api_serialize(en_tokenizer, text):
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
new_tokens = Doc(tokens.vocab).from_bytes(
tokens.to_bytes(tensor=False), tensor=False)
tokens.to_bytes(tensor=False), tensor=False
)
assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
new_tokens = Doc(tokens.vocab).from_bytes(
tokens.to_bytes(sentiment=False), sentiment=False)
tokens.to_bytes(sentiment=False), sentiment=False
)
assert tokens.text == new_tokens.text
assert [t.text for t in tokens] == [t.text for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
@ -126,10 +129,10 @@ def test_doc_api_set_ents(en_tokenizer):
text = "I use goggle chrone to surf the web"
tokens = en_tokenizer(text)
assert len(tokens.ents) == 0
tokens.ents = [(tokens.vocab.strings['PRODUCT'], 2, 4)]
tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)]
assert len(list(tokens.ents)) == 1
assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0]
assert tokens.ents[0].label_ == 'PRODUCT'
assert tokens.ents[0].label_ == "PRODUCT"
assert tokens.ents[0].start == 2
assert tokens.ents[0].end == 4
@ -140,21 +143,31 @@ def test_doc_api_merge(en_tokenizer):
# merge 'The Beach Boys'
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
ent_type='TYPE')
doc.merge(
doc[4].idx,
doc[6].idx + len(doc[6]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
assert doc[4].text_with_ws == 'the beach boys '
assert doc[4].tag_ == 'NAMED'
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
# merge 'all night'
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), tag='NAMED', lemma='LEMMA',
ent_type='TYPE')
doc.merge(
doc[7].idx,
doc[8].idx + len(doc[8]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
assert len(doc) == 8
assert doc[7].text == 'all night'
assert doc[7].text_with_ws == 'all night'
assert doc[7].text == "all night"
assert doc[7].text_with_ws == "all night"
def test_doc_api_merge_children(en_tokenizer):
@ -162,8 +175,13 @@ def test_doc_api_merge_children(en_tokenizer):
text = "WKRO played songs by the beach boys all night"
doc = en_tokenizer(text)
assert len(doc) == 9
doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA',
ent_type='TYPE')
doc.merge(
doc[4].idx,
doc[6].idx + len(doc[6]),
tag="NAMED",
lemma="LEMMA",
ent_type="TYPE",
)
for word in doc:
if word.i < word.head.i:
@ -175,8 +193,8 @@ def test_doc_api_merge_children(en_tokenizer):
def test_doc_api_merge_hang(en_tokenizer):
text = "through North and South Carolina"
doc = en_tokenizer(text)
doc.merge(18, 32, tag='', lemma='', ent_type='ORG')
doc.merge(8, 32, tag='', lemma='', ent_type='ORG')
doc.merge(18, 32, tag="", lemma="", ent_type="ORG")
doc.merge(8, 32, tag="", lemma="", ent_type="ORG")
def test_doc_api_retokenizer(en_tokenizer):
@ -184,19 +202,19 @@ def test_doc_api_retokenizer(en_tokenizer):
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7])
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
assert doc[4].text == "the beach boys"
def test_doc_api_retokenizer_attrs(en_tokenizer):
doc = en_tokenizer("WKRO played songs by the beach boys all night")
# test both string and integer attributes and values
attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']}
attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]}
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4:7], attrs=attrs)
assert len(doc) == 7
assert doc[4].text == 'the beach boys'
assert doc[4].lemma_ == 'boys'
assert doc[4].ent_type_ == 'ORG'
assert doc[4].text == "the beach boys"
assert doc[4].lemma_ == "boys"
assert doc[4].ent_type_ == "ORG"
@pytest.mark.xfail
@ -205,11 +223,11 @@ def test_doc_api_retokenizer_lex_attrs(en_tokenizer):
doc = en_tokenizer("WKRO played beach boys songs")
assert not any(token.is_stop for token in doc)
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[2:4], attrs={'LEMMA': 'boys', 'IS_STOP': True})
assert doc[2].text == 'beach boys'
assert doc[2].lemma_ == 'boys'
retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True})
assert doc[2].text == "beach boys"
assert doc[2].lemma_ == "boys"
assert doc[2].is_stop
new_doc = Doc(doc.vocab, words=['beach boys'])
new_doc = Doc(doc.vocab, words=["beach boys"])
assert new_doc[0].is_stop
@ -222,21 +240,25 @@ def test_doc_api_sents_empty_string(en_tokenizer):
def test_doc_api_runtime_error(en_tokenizer):
# Example that caused run-time error while parsing Reddit
# fmt: off
text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school"
deps = ['nsubj', 'prep', 'amod', 'pobj', 'ROOT', 'amod', 'attr', '',
'nummod', 'prep', 'det', 'amod', 'pobj', 'acl', 'prep', 'prep',
'pobj', '', 'nummod', 'prep', 'det', 'amod', 'pobj', 'aux', 'neg',
'ROOT', 'amod', 'dobj']
deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "",
"nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep",
"pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg",
"ROOT", "amod", "dobj"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps)
nps = []
for np in doc.noun_chunks:
while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"):
np = np[1:]
if len(np) > 1:
nps.append((np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_))
nps.append(
(np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)
)
for np in nps:
start, end, tag, lemma, ent_type = np
doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type)
@ -244,57 +266,76 @@ def test_doc_api_runtime_error(en_tokenizer):
def test_doc_api_right_edge(en_tokenizer):
"""Test for bug occurring from Unshift action, causing incorrect right edge"""
# fmt: off
text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue."
heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1,
-2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert doc[6].text == 'for'
assert doc[6].text == "for"
subtree = [w.text for w in doc[6].subtree]
assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as',
'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ',']
assert doc[6].right_edge.text == ','
assert subtree == [
"for",
"the",
"sake",
"of",
"such",
"as",
"live",
"under",
"the",
"government",
"of",
"the",
"Romans",
",",
]
assert doc[6].right_edge.text == ","
def test_doc_api_has_vector():
vocab = Vocab()
vocab.reset_vectors(width=2)
vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f'))
doc = Doc(vocab, words=['kitten'])
vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
doc = Doc(vocab, words=["kitten"])
assert doc.has_vector
def test_doc_api_similarity_match():
doc = Doc(Vocab(), words=['a'])
doc = Doc(Vocab(), words=["a"])
with pytest.warns(None):
assert doc.similarity(doc[0]) == 1.0
assert doc.similarity(doc.vocab['a']) == 1.0
doc2 = Doc(doc.vocab, words=['a', 'b', 'c'])
assert doc.similarity(doc.vocab["a"]) == 1.0
doc2 = Doc(doc.vocab, words=["a", "b", "c"])
with pytest.warns(None):
assert doc.similarity(doc2[:1]) == 1.0
assert doc.similarity(doc2) == 0.0
def test_lowest_common_ancestor(en_tokenizer):
tokens = en_tokenizer('the lazy dog slept')
tokens = en_tokenizer("the lazy dog slept")
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
lca = doc.get_lca_matrix()
assert(lca[1, 1] == 1)
assert(lca[0, 1] == 2)
assert(lca[1, 2] == 2)
assert lca[1, 1] == 1
assert lca[0, 1] == 2
assert lca[1, 2] == 2
def test_parse_tree(en_tokenizer):
"""Tests doc.print_tree() method."""
text = 'I like New York in Autumn.'
text = "I like New York in Autumn."
heads = [1, 0, 1, -2, -3, -1, -5]
tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags)
# full method parse_tree(text) is a trivial composition
trees = doc.print_tree()
assert len(trees) > 0
tree = trees[0]
assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers'])
assert tree['word'] == 'like' # check root is correct
assert all(
k in list(tree.keys())
for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"]
)
assert tree["word"] == "like" # check root is correct

View File

@ -7,37 +7,38 @@ from spacy.compat import pickle, unicode_
def test_pickle_single_doc():
nlp = Language()
doc = nlp('pickle roundtrip')
doc = nlp("pickle roundtrip")
data = pickle.dumps(doc, 1)
doc2 = pickle.loads(data)
assert doc2.text == 'pickle roundtrip'
assert doc2.text == "pickle roundtrip"
def test_list_of_docs_pickles_efficiently():
nlp = Language()
for i in range(10000):
_ = nlp.vocab[unicode_(i)]
one_pickled = pickle.dumps(nlp('0'), -1)
one_pickled = pickle.dumps(nlp("0"), -1)
docs = list(nlp.pipe(unicode_(i) for i in range(100)))
many_pickled = pickle.dumps(docs, -1)
assert len(many_pickled) < (len(one_pickled) * 2)
many_unpickled = pickle.loads(many_pickled)
assert many_unpickled[0].text == '0'
assert many_unpickled[-1].text == '99'
assert many_unpickled[0].text == "0"
assert many_unpickled[-1].text == "99"
assert len(many_unpickled) == 100
def test_user_data_from_disk():
nlp = Language()
doc = nlp('Hello')
doc = nlp("Hello")
doc.user_data[(0, 1)] = False
b = doc.to_bytes()
doc2 = doc.__class__(doc.vocab).from_bytes(b)
assert doc2.user_data[(0, 1)] == False
def test_user_data_unpickles():
nlp = Language()
doc = nlp('Hello')
doc = nlp("Hello")
doc.user_data[(0, 1)] = False
b = pickle.dumps(doc)
doc2 = pickle.loads(b)
@ -46,10 +47,11 @@ def test_user_data_unpickles():
def test_hooks_unpickle():
def inner_func(d1, d2):
return 'hello!'
return "hello!"
nlp = Language()
doc = nlp('Hello')
doc.user_hooks['similarity'] = inner_func
doc = nlp("Hello")
doc.user_hooks["similarity"] = inner_func
b = pickle.dumps(doc)
doc2 = pickle.loads(b)
assert doc2.similarity(None) == 'hello!'
assert doc2.similarity(None) == "hello!"

View File

@ -11,10 +11,12 @@ from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
# fmt: off
text = "This is a sentence. This is another sentence. And a third."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
# fmt: on
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
@ -39,17 +41,17 @@ def test_spans_sent_spans(doc):
def test_spans_root(doc):
span = doc[2:4]
assert len(span) == 2
assert span.text == 'a sentence'
assert span.root.text == 'sentence'
assert span.root.head.text == 'is'
assert span.text == "a sentence"
assert span.root.text == "sentence"
assert span.root.head.text == "is"
def test_spans_string_fn(doc):
span = doc[0:4]
assert len(span) == 4
assert span.text == 'This is a sentence'
assert span.upper_ == 'THIS IS A SENTENCE'
assert span.lower_ == 'this is a sentence'
assert span.text == "This is a sentence"
assert span.upper_ == "THIS IS A SENTENCE"
assert span.lower_ == "this is a sentence"
def test_spans_root2(en_tokenizer):
@ -57,15 +59,15 @@ def test_spans_root2(en_tokenizer):
heads = [0, 3, -1, -2, -4]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert doc[-2:].root.text == 'Carolina'
assert doc[-2:].root.text == "Carolina"
def test_spans_span_sent(doc, doc_not_parsed):
"""Test span.sent property"""
assert len(list(doc.sents))
assert doc[:2].sent.root.text == 'is'
assert doc[:2].sent.text == 'This is a sentence .'
assert doc[6:7].sent.root.left_edge.text == 'This'
assert doc[:2].sent.root.text == "is"
assert doc[:2].sent.text == "This is a sentence ."
assert doc[6:7].sent.root.left_edge.text == "This"
# test on manual sbd
doc_not_parsed[0].is_sent_start = True
doc_not_parsed[5].is_sent_start = True
@ -75,23 +77,23 @@ def test_spans_span_sent(doc, doc_not_parsed):
def test_spans_lca_matrix(en_tokenizer):
"""Test span's lca matrix generation"""
tokens = en_tokenizer('the lazy dog slept')
tokens = en_tokenizer("the lazy dog slept")
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0])
lca = doc[:2].get_lca_matrix()
assert(lca[0, 0] == 0)
assert(lca[0, 1] == -1)
assert(lca[1, 0] == -1)
assert(lca[1, 1] == 1)
assert lca[0, 0] == 0
assert lca[0, 1] == -1
assert lca[1, 0] == -1
assert lca[1, 1] == 1
def test_span_similarity_match():
doc = Doc(Vocab(), words=['a', 'b', 'a', 'b'])
doc = Doc(Vocab(), words=["a", "b", "a", "b"])
span1 = doc[:2]
span2 = doc[2:]
with pytest.warns(None):
assert span1.similarity(span2) == 1.0
assert span1.similarity(doc) == 0.0
assert span1[:1].similarity(doc.vocab['a']) == 1.0
assert span1[:1].similarity(doc.vocab["a"]) == 1.0
def test_spans_default_sentiment(en_tokenizer):
@ -102,8 +104,8 @@ def test_spans_default_sentiment(en_tokenizer):
tokens.vocab[tokens[2].text].sentiment = -2.0
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
assert doc[:2].sentiment == 3.0 / 2
assert doc[-2:].sentiment == -2. / 2
assert doc[:-1].sentiment == (3.+-2) / 3.
assert doc[-2:].sentiment == -2.0 / 2
assert doc[:-1].sentiment == (3.0 + -2) / 3.0
def test_spans_override_sentiment(en_tokenizer):
@ -113,7 +115,7 @@ def test_spans_override_sentiment(en_tokenizer):
tokens.vocab[tokens[0].text].sentiment = 3.0
tokens.vocab[tokens[2].text].sentiment = -2.0
doc = Doc(tokens.vocab, words=[t.text for t in tokens])
doc.user_span_hooks['sentiment'] = lambda span: 10.0
doc.user_span_hooks["sentiment"] = lambda span: 10.0
assert doc[:2].sentiment == 10.0
assert doc[-2:].sentiment == 10.0
assert doc[:-1].sentiment == 10.0
@ -132,10 +134,10 @@ def test_spans_are_hashable(en_tokenizer):
def test_spans_by_character(doc):
span1 = doc[1:-2]
span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE')
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == 'GPE'
assert span2.label_ == "GPE"
def test_span_to_array(doc):
@ -151,12 +153,13 @@ def test_span_as_doc(doc):
span_doc = span.as_doc()
assert span.text == span_doc.text.strip()
def test_span_ents_property(doc):
"""Test span.ents for the """
doc.ents = [
(doc.vocab.strings['PRODUCT'], 0, 1),
(doc.vocab.strings['PRODUCT'], 7, 8),
(doc.vocab.strings['PRODUCT'], 11, 14)
(doc.vocab.strings["PRODUCT"], 0, 1),
(doc.vocab.strings["PRODUCT"], 7, 8),
(doc.vocab.strings["PRODUCT"], 11, 14),
]
assert len(list(doc.ents)) == 3
sentences = list(doc.sents)

View File

@ -13,22 +13,23 @@ def test_spans_merge_tokens(en_tokenizer):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert len(doc) == 4
assert doc[0].head.text == 'Angeles'
assert doc[1].head.text == 'start'
doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', ent_type='GPE')
assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start"
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE")
assert len(doc) == 3
assert doc[0].text == 'Los Angeles'
assert doc[0].head.text == 'start'
assert doc[0].text == "Los Angeles"
assert doc[0].head.text == "start"
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert len(doc) == 4
assert doc[0].head.text == 'Angeles'
assert doc[1].head.text == 'start'
doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE')
assert doc[0].head.text == "Angeles"
assert doc[1].head.text == "start"
doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE")
assert len(doc) == 3
assert doc[0].text == 'Los Angeles'
assert doc[0].head.text == 'start'
assert doc[0].ent_type_ == 'GPE'
assert doc[0].text == "Los Angeles"
assert doc[0].head.text == "start"
assert doc[0].ent_type_ == "GPE"
def test_spans_merge_heads(en_tokenizer):
text = "I found a pilates class near work."
@ -37,8 +38,13 @@ def test_spans_merge_heads(en_tokenizer):
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert len(doc) == 8
doc.merge(doc[3].idx, doc[4].idx + len(doc[4]), tag=doc[4].tag_,
lemma='pilates class', ent_type='O')
doc.merge(
doc[3].idx,
doc[4].idx + len(doc[4]),
tag=doc[4].tag_,
lemma="pilates class",
ent_type="O",
)
assert len(doc) == 7
assert doc[0].head.i == 1
assert doc[1].head.i == 1
@ -55,8 +61,9 @@ def test_span_np_merges(en_tokenizer):
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
assert doc[4].head.i == 1
doc.merge(doc[2].idx, doc[4].idx + len(doc[4]), tag='NP', lemma='tool',
ent_type='O')
doc.merge(
doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O"
)
assert doc[2].head.i == 1
text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
@ -69,7 +76,6 @@ def test_span_np_merges(en_tokenizer):
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
assert merged != None, (start, end, label, lemma)
text = "One test with entities like New York City so the ents list is not void"
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
tokens = en_tokenizer(text)
@ -80,15 +86,23 @@ def test_span_np_merges(en_tokenizer):
def test_spans_entity_merge(en_tokenizer):
# fmt: off
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
tags = ['NNP', 'NNP', 'VBZ', 'DT', 'VB', 'RP', 'NN', 'WP', 'VBZ', 'IN', 'NNP', 'CC', 'VBZ', 'NNP', 'NNP', '.', 'SP']
ents = [(0, 2, 'PERSON'), (10, 11, 'GPE'), (13, 15, 'PERSON')]
tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"]
ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents
)
assert len(doc) == 17
for ent in doc.ents:
label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
label, lemma, type_ = (
ent.root.tag_,
ent.root.lemma_,
max(w.ent_type_ for w in ent),
)
ent.merge(label=label, lemma=lemma, ent_type=type_)
# check looping is ok
assert len(doc) == 15
@ -98,8 +112,10 @@ def test_spans_entity_merge_iob():
# Test entity IOB stays consistent after merging
words = ["a", "b", "c", "d", "e"]
doc = Doc(Vocab(), words=words)
doc.ents = [(doc.vocab.strings.add('ent-abc'), 0, 3),
(doc.vocab.strings.add('ent-d'), 3, 4)]
doc.ents = [
(doc.vocab.strings.add("ent-abc"), 0, 3),
(doc.vocab.strings.add("ent-d"), 3, 4),
]
assert doc[0].ent_iob_ == "B"
assert doc[1].ent_iob_ == "I"
assert doc[2].ent_iob_ == "I"
@ -110,33 +126,37 @@ def test_spans_entity_merge_iob():
def test_spans_sentence_update_after_merge(en_tokenizer):
# fmt: off
text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
'compound', 'dobj', 'punct']
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
sent1, sent2 = list(doc.sents)
init_len = len(sent1)
init_len2 = len(sent2)
doc[0:2].merge(label='none', lemma='none', ent_type='none')
doc[-2:].merge(label='none', lemma='none', ent_type='none')
doc[0:2].merge(label="none", lemma="none", ent_type="none")
doc[-2:].merge(label="none", lemma="none", ent_type="none")
assert len(sent1) == init_len - 1
assert len(sent2) == init_len2 - 1
def test_spans_subtree_size_check(en_tokenizer):
# fmt: off
text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
'nsubj', 'relcl', 'prep', 'pobj', 'cc', 'conj', 'compound',
'dobj']
deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr",
"nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound",
"dobj"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
sent1 = list(doc.sents)[0]
init_len = len(list(sent1.root.subtree))
doc[0:2].merge(label='none', lemma='none', ent_type='none')
doc[0:2].merge(label="none", lemma="none", ent_type="none")
assert len(list(sent1.root.subtree)) == init_len - 1

View File

@ -13,31 +13,35 @@ from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
# fmt: off
text = "This is a sentence. This is another sentence. And a third."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
"attr", "punct", "ROOT", "det", "npadvmod", "punct"]
# fmt: on
tokens = en_tokenizer(text)
return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
def test_doc_token_api_strings(en_tokenizer):
text = "Give it back! He pleaded."
pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT']
pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"]
heads = [0, -1, -2, -3, 1, 0, -1]
deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct']
deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps)
assert doc[0].orth_ == 'Give'
assert doc[0].text == 'Give'
assert doc[0].text_with_ws == 'Give '
assert doc[0].lower_ == 'give'
assert doc[0].shape_ == 'Xxxx'
assert doc[0].prefix_ == 'G'
assert doc[0].suffix_ == 'ive'
assert doc[0].pos_ == 'VERB'
assert doc[0].dep_ == 'ROOT'
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps
)
assert doc[0].orth_ == "Give"
assert doc[0].text == "Give"
assert doc[0].text_with_ws == "Give "
assert doc[0].lower_ == "give"
assert doc[0].shape_ == "Xxxx"
assert doc[0].prefix_ == "G"
assert doc[0].suffix_ == "ive"
assert doc[0].pos_ == "VERB"
assert doc[0].dep_ == "ROOT"
def test_doc_token_api_flags(en_tokenizer):
@ -53,7 +57,7 @@ def test_doc_token_api_flags(en_tokenizer):
# TODO: Test more of these, esp. if a bug is found
@pytest.mark.parametrize('text', ["Give it back! He pleaded."])
@pytest.mark.parametrize("text", ["Give it back! He pleaded."])
def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text):
word = text.split()[0]
en_tokenizer.vocab[word].prob = -1
@ -61,11 +65,11 @@ def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text):
assert tokens[0].prob != 0
@pytest.mark.parametrize('text', ["one two"])
@pytest.mark.parametrize("text", ["one two"])
def test_doc_token_api_str_builtin(en_tokenizer, text):
tokens = en_tokenizer(text)
assert str(tokens[0]) == text.split(' ')[0]
assert str(tokens[1]) == text.split(' ')[1]
assert str(tokens[0]) == text.split(" ")[0]
assert str(tokens[1]) == text.split(" ")[1]
def test_doc_token_api_is_properties(en_vocab):
@ -83,16 +87,16 @@ def test_doc_token_api_is_properties(en_vocab):
def test_doc_token_api_vectors():
vocab = Vocab()
vocab.reset_vectors(width=2)
vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f'))
vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f'))
doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
doc = Doc(vocab, words=["apples", "oranges", "oov"])
assert doc.has_vector
assert doc[0].has_vector
assert doc[1].has_vector
assert not doc[2].has_vector
apples_norm = (0*0 + 2*2) ** 0.5
oranges_norm = (0*0 + 1*1) ** 0.5
cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm)
apples_norm = (0 * 0 + 2 * 2) ** 0.5
oranges_norm = (0 * 0 + 1 * 1) ** 0.5
cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
assert doc[0].similarity(doc[1]) == cosine
@ -165,7 +169,7 @@ def test_doc_token_api_head_setter(en_tokenizer):
def test_is_sent_start(en_tokenizer):
doc = en_tokenizer('This is a sentence. This is another.')
doc = en_tokenizer("This is a sentence. This is another.")
assert doc[5].is_sent_start is None
doc[5].is_sent_start = True
assert doc[5].is_sent_start is True
@ -174,17 +178,17 @@ def test_is_sent_start(en_tokenizer):
def test_set_pos():
doc = Doc(Vocab(), words=['hello', 'world'])
doc[0].pos_ = 'NOUN'
assert doc[0].pos_ == 'NOUN'
doc = Doc(Vocab(), words=["hello", "world"])
doc[0].pos_ = "NOUN"
assert doc[0].pos_ == "NOUN"
doc[1].pos = VERB
assert doc[1].pos_ == 'VERB'
assert doc[1].pos_ == "VERB"
def test_tokens_sent(doc):
"""Test token.sent property"""
assert len(list(doc.sents)) == 3
assert doc[1].sent.text == 'This is a sentence .'
assert doc[7].sent.text == 'This is another sentence .'
assert doc[1].sent.root.left_edge.text == 'This'
assert doc[7].sent.root.left_edge.text == 'This'
assert doc[1].sent.text == "This is a sentence ."
assert doc[7].sent.text == "This is another sentence ."
assert doc[1].sent.root.left_edge.text == "This"
assert doc[7].sent.root.left_edge.text == "This"

View File

@ -20,7 +20,7 @@ def test_doc_underscore_getattr_setattr():
doc = Mock()
doc.doc = doc
doc.user_data = {}
Underscore.doc_extensions['hello'] = (False, None, None, None)
Underscore.doc_extensions["hello"] = (False, None, None, None)
doc._ = Underscore(Underscore.doc_extensions, doc)
assert doc._.hello == False
doc._.hello = True
@ -29,8 +29,9 @@ def test_doc_underscore_getattr_setattr():
def test_create_span_underscore():
span = Mock(doc=Mock(), start=0, end=2)
uscore = Underscore(Underscore.span_extensions, span,
start=span.start, end=span.end)
uscore = Underscore(
Underscore.span_extensions, span, start=span.start, end=span.end
)
assert uscore._doc is span.doc
assert uscore._start is span.start
assert uscore._end is span.end
@ -38,60 +39,70 @@ def test_create_span_underscore():
def test_span_underscore_getter_setter():
span = Mock(doc=Mock(), start=0, end=2)
Underscore.span_extensions['hello'] = (None, None,
lambda s: (s.start, 'hi'),
lambda s, value: setattr(s, 'start',
value))
span._ = Underscore(Underscore.span_extensions, span,
start=span.start, end=span.end)
Underscore.span_extensions["hello"] = (
None,
None,
lambda s: (s.start, "hi"),
lambda s, value: setattr(s, "start", value),
)
span._ = Underscore(
Underscore.span_extensions, span, start=span.start, end=span.end
)
assert span._.hello == (0, 'hi')
assert span._.hello == (0, "hi")
span._.hello = 1
assert span._.hello == (1, 'hi')
assert span._.hello == (1, "hi")
def test_token_underscore_method():
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese')
Underscore.token_extensions['hello'] = (None, token.say_cheese,
None, None)
token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: "cheese")
Underscore.token_extensions["hello"] = (None, token.say_cheese, None, None)
token._ = Underscore(Underscore.token_extensions, token, start=token.idx)
assert token._.hello() == 'cheese'
assert token._.hello() == "cheese"
@pytest.mark.parametrize('obj', [Doc, Span, Token])
@pytest.mark.parametrize("obj", [Doc, Span, Token])
def test_doc_underscore_remove_extension(obj):
ext_name = 'to_be_removed'
ext_name = "to_be_removed"
obj.set_extension(ext_name, default=False)
assert obj.has_extension(ext_name)
obj.remove_extension(ext_name)
assert not obj.has_extension(ext_name)
@pytest.mark.parametrize('obj', [Doc, Span, Token])
@pytest.mark.parametrize("obj", [Doc, Span, Token])
def test_underscore_raises_for_dup(obj):
obj.set_extension('test', default=None)
obj.set_extension("test", default=None)
with pytest.raises(ValueError):
obj.set_extension('test', default=None)
obj.set_extension("test", default=None)
@pytest.mark.parametrize('invalid_kwargs', [
{'getter': None, 'setter': lambda: None},
{'default': None, 'method': lambda: None, 'getter': lambda: None},
{'setter': lambda: None},
{'default': None, 'method': lambda: None},
{'getter': True}])
@pytest.mark.parametrize(
"invalid_kwargs",
[
{"getter": None, "setter": lambda: None},
{"default": None, "method": lambda: None, "getter": lambda: None},
{"setter": lambda: None},
{"default": None, "method": lambda: None},
{"getter": True},
],
)
def test_underscore_raises_for_invalid(invalid_kwargs):
invalid_kwargs['force'] = True
invalid_kwargs["force"] = True
with pytest.raises(ValueError):
Doc.set_extension('test', **invalid_kwargs)
Doc.set_extension("test", **invalid_kwargs)
@pytest.mark.parametrize('valid_kwargs', [
{'getter': lambda: None},
{'getter': lambda: None, 'setter': lambda: None},
{'default': 'hello'},
{'default': None},
{'method': lambda: None}])
@pytest.mark.parametrize(
"valid_kwargs",
[
{"getter": lambda: None},
{"getter": lambda: None, "setter": lambda: None},
{"default": "hello"},
{"default": None},
{"method": lambda: None},
],
)
def test_underscore_accepts_valid(valid_kwargs):
valid_kwargs['force'] = True
Doc.set_extension('test', **valid_kwargs)
valid_kwargs["force"] = True
Doc.set_extension("test", **valid_kwargs)

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["ق.م", "إلخ", "ص.ب", "ت."])
@pytest.mark.parametrize("text", ["ق.م", "إلخ", "ص.ب", "ت."])
def test_ar_tokenizer_handles_abbr(ar_tokenizer, text):
tokens = ar_tokenizer(text)
assert len(tokens) == 1
@ -18,7 +18,7 @@ def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
assert tokens[6].lemma_ == "قبل الميلاد"
def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer):
text = "يبلغ طول مضيق طارق 14كم "
tokens = ar_tokenizer(text)
assert len(tokens) == 6

View File

@ -6,16 +6,22 @@ import pytest
TESTCASES = [
# punctuation tests
('আমি বাংলায় গান গাই!', ['আমি', 'বাংলায়', 'গান', 'গাই', '!']),
('আমি বাংলায় কথা কই।', ['আমি', 'বাংলায়', 'কথা', 'কই', '।']),
('বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?', ['বসুন্ধরা', 'জনসম্মুখে', 'দোষ', 'স্বীকার', 'করলো', 'না', '?']),
('টাকা থাকলে কি না হয়!', ['টাকা', 'থাকলে', 'কি', 'না', 'হয়', '!']),
("আমি বাংলায় গান গাই!", ["আমি", "বাংলায়", "গান", "গাই", "!"]),
("আমি বাংলায় কথা কই।", ["আমি", "বাংলায়", "কথা", "কই", ""]),
(
"বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?",
["বসুন্ধরা", "জনসম্মুখে", "দোষ", "স্বীকার", "করলো", "না", "?"],
),
("টাকা থাকলে কি না হয়!", ["টাকা", "থাকলে", "কি", "না", "হয়", "!"]),
# abbreviations
('ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।', ['ডঃ', 'খালেদ', 'বললেন', 'ঢাকায়', '৩৫', 'ডিগ্রি', 'সে.', '।'])
(
"ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।",
["ডঃ", "খালেদ", "বললেন", "ঢাকায়", "৩৫", "ডিগ্রি", "সে.", ""],
),
]
@pytest.mark.parametrize('text,expected_tokens', TESTCASES)
@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens):
tokens = bn_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]

View File

@ -4,19 +4,19 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."])
@pytest.mark.parametrize("text", ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."])
def test_da_tokenizer_handles_abbr(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."])
@pytest.mark.parametrize("text", ["Jul.", "jul.", "Tor.", "Tors."])
def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["1.", "10.", "31."])
@pytest.mark.parametrize("text", ["1.", "10.", "31."])
def test_da_tokenizer_handles_dates(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 1
@ -37,8 +37,9 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
assert tokens[7].text == "."
@pytest.mark.parametrize('text,norm', [
("akvarium", "akvarie"), ("bedstemoder", "bedstemor")])
@pytest.mark.parametrize(
"text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]
)
def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
tokens = da_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -4,11 +4,15 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('string,lemma', [
('affaldsgruppernes', 'affaldsgruppe'),
('detailhandelsstrukturernes', 'detailhandelsstruktur'),
('kolesterols', 'kolesterol'),
('åsyns', 'åsyn')])
@pytest.mark.parametrize(
"string,lemma",
[
("affaldsgruppernes", "affaldsgruppe"),
("detailhandelsstrukturernes", "detailhandelsstruktur"),
("kolesterols", "kolesterol"),
("åsyns", "åsyn"),
],
)
def test_da_lemmatizer_lookup_assigns(da_tokenizer, string, lemma):
tokens = da_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@ -4,19 +4,19 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(under)"])
@pytest.mark.parametrize("text", ["(under)"])
def test_da_tokenizer_splits_no_special(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["ta'r", "Søren's", "Lars'"])
@pytest.mark.parametrize("text", ["ta'r", "Søren's", "Lars'"])
def test_da_tokenizer_handles_no_punct(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["(ta'r"])
@pytest.mark.parametrize("text", ["(ta'r"])
def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 2
@ -24,7 +24,7 @@ def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text):
assert tokens[1].text == "ta'r"
@pytest.mark.parametrize('text', ["ta'r)"])
@pytest.mark.parametrize("text", ["ta'r)"])
def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 2
@ -32,15 +32,16 @@ def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text):
assert tokens[1].text == ")"
@pytest.mark.parametrize('text,expected', [
("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])])
@pytest.mark.parametrize(
"text,expected", [("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])]
)
def test_da_tokenizer_splits_even_wrap(da_tokenizer, text, expected):
tokens = da_tokenizer(text)
assert len(tokens) == len(expected)
assert [t.text for t in tokens] == expected
@pytest.mark.parametrize('text', ["(ta'r?)"])
@pytest.mark.parametrize("text", ["(ta'r?)"])
def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 4
@ -50,15 +51,17 @@ def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text):
assert tokens[3].text == ")"
@pytest.mark.parametrize('text,expected', [
("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])])
@pytest.mark.parametrize(
"text,expected",
[("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])],
)
def test_da_tokenizer_splits_prefix_interact(da_tokenizer, text, expected):
tokens = da_tokenizer(text)
assert len(tokens) == len(expected)
assert [t.text for t in tokens] == expected
@pytest.mark.parametrize('text', ["f.eks.)"])
@pytest.mark.parametrize("text", ["f.eks.)"])
def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 2
@ -66,7 +69,7 @@ def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text):
assert tokens[1].text == ")"
@pytest.mark.parametrize('text', ["(f.eks.)"])
@pytest.mark.parametrize("text", ["(f.eks.)"])
def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 3
@ -75,7 +78,7 @@ def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text):
assert tokens[2].text == ")"
@pytest.mark.parametrize('text', ["(f.eks.?)"])
@pytest.mark.parametrize("text", ["(f.eks.?)"])
def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 4
@ -85,19 +88,19 @@ def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text):
assert tokens[3].text == ")"
@pytest.mark.parametrize('text', ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"])
@pytest.mark.parametrize("text", ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"])
def test_da_tokenizer_handles_numeric_range(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["sort.Gul", "Hej.Verden"])
@pytest.mark.parametrize("text", ["sort.Gul", "Hej.Verden"])
def test_da_tokenizer_splits_period_infix(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hej,Verden", "en,to"])
@pytest.mark.parametrize("text", ["Hej,Verden", "en,to"])
def test_da_tokenizer_splits_comma_infix(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 3
@ -106,20 +109,25 @@ def test_da_tokenizer_splits_comma_infix(da_tokenizer, text):
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["sort...Gul", "sort...gul"])
@pytest.mark.parametrize("text", ["sort...Gul", "sort...gul"])
def test_da_tokenizer_splits_ellipsis_infix(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ['gå-på-mod', '4-hjulstræk', '100-Pfennig-frimærke', 'TV-2-spots', 'trofæ-vaeggen'])
@pytest.mark.parametrize(
"text",
["gå-på-mod", "4-hjulstræk", "100-Pfennig-frimærke", "TV-2-spots", "trofæ-vaeggen"],
)
def test_da_tokenizer_keeps_hyphens(da_tokenizer, text):
tokens = da_tokenizer(text)
assert len(tokens) == 1
def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer):
tokens = da_tokenizer("Mange regler--eksempelvis bindestregs-reglerne--er komplicerede.")
tokens = da_tokenizer(
"Mange regler--eksempelvis bindestregs-reglerne--er komplicerede."
)
assert len(tokens) == 9
assert tokens[0].text == "Mange"
assert tokens[1].text == "regler"
@ -132,7 +140,9 @@ def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer):
def test_da_tokenizer_handles_posessives_and_contractions(da_tokenizer):
tokens = da_tokenizer("'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun.")
tokens = da_tokenizer(
"'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun."
)
assert len(tokens) == 25
assert tokens[0].text == "'"
assert tokens[1].text == "DBA's"

View File

@ -15,17 +15,29 @@ Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der
assert len(tokens) == 84
@pytest.mark.parametrize('text,match', [
('10', True), ('1', True), ('10.000', True), ('10.00', True),
('999,0', True), ('en', True), ('treoghalvfemsindstyvende', True), ('hundrede', True),
('hund', False), (',', False), ('1/2', True)])
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("10.000", True),
("10.00", True),
("999,0", True),
("en", True),
("treoghalvfemsindstyvende", True),
("hundrede", True),
("hund", False),
(",", False),
("1/2", True),
],
)
def test_lex_attrs_like_number(da_tokenizer, text, match):
tokens = da_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].like_num == match
@pytest.mark.parametrize('word', ['elleve', 'første'])
@pytest.mark.parametrize("word", ["elleve", "første"])
def test_da_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -4,13 +4,13 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
@pytest.mark.parametrize("text", ["auf'm", "du's", "über'm", "wir's"])
def test_de_tokenizer_splits_contractions(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
@pytest.mark.parametrize("text", ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
def test_de_tokenizer_handles_abbr(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
@ -24,14 +24,16 @@ def test_de_tokenizer_handles_exc_in_text(de_tokenizer):
assert tokens[2].lemma_ == "zur Zeit"
@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])])
@pytest.mark.parametrize(
"text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]
)
def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms):
tokens = de_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.xfail
@pytest.mark.parametrize('text,norm', [("daß", "dass")])
@pytest.mark.parametrize("text,norm", [("daß", "dass")])
def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm):
tokens = de_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -4,13 +4,17 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('string,lemma', [
('Abgehängten', 'Abgehängte'),
('engagierte', 'engagieren'),
('schließt', 'schließen'),
('vorgebenden', 'vorgebend'),
('die', 'der'),
('Die', 'der')])
@pytest.mark.parametrize(
"string,lemma",
[
("Abgehängten", "Abgehängte"),
("engagierte", "engagieren"),
("schließt", "schließen"),
("vorgebenden", "vorgebend"),
("die", "der"),
("Die", "der"),
],
)
def test_de_lemmatizer_lookup_assigns(de_tokenizer, string, lemma):
tokens = de_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@ -7,10 +7,12 @@ from ...util import get_doc
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
text = "Eine Tasse steht auf dem Tisch."
heads = [1, 1, 0, -1, 1, -2, -4]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "$."]
deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"]
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "Eine Tasse "
@ -20,10 +22,12 @@ def test_de_parser_noun_chunks_standard_de(de_tokenizer):
def test_de_extended_chunk(de_tokenizer):
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "NN", "NN", "$."]
deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"]
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Die Sängerin "

View File

@ -4,79 +4,79 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(unter)"])
@pytest.mark.parametrize("text", ["(unter)"])
def test_de_tokenizer_splits_no_special(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["unter'm"])
@pytest.mark.parametrize("text", ["unter'm"])
def test_de_tokenizer_splits_no_punct(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(unter'm"])
@pytest.mark.parametrize("text", ["(unter'm"])
def test_de_tokenizer_splits_prefix_punct(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["unter'm)"])
@pytest.mark.parametrize("text", ["unter'm)"])
def test_de_tokenizer_splits_suffix_punct(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(unter'm)"])
@pytest.mark.parametrize("text", ["(unter'm)"])
def test_de_tokenizer_splits_even_wrap(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["(unter'm?)"])
@pytest.mark.parametrize("text", ["(unter'm?)"])
def test_de_tokenizer_splits_uneven_wrap(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 5
@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
@pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
def test_de_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
tokens = de_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["z.B.)"])
@pytest.mark.parametrize("text", ["z.B.)"])
def test_de_tokenizer_splits_suffix_interact(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(z.B.)"])
@pytest.mark.parametrize("text", ["(z.B.)"])
def test_de_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(z.B.?)"])
@pytest.mark.parametrize("text", ["(z.B.?)"])
def test_de_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_de_tokenizer_splits_numeric_range(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
@pytest.mark.parametrize("text", ["blau.Rot", "Hallo.Welt"])
def test_de_tokenizer_splits_period_infix(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
@pytest.mark.parametrize("text", ["Hallo,Welt", "eins,zwei"])
def test_de_tokenizer_splits_comma_infix(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@ -85,13 +85,13 @@ def test_de_tokenizer_splits_comma_infix(de_tokenizer, text):
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
@pytest.mark.parametrize("text", ["blau...Rot", "blau...rot"])
def test_de_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
@pytest.mark.parametrize("text", ["Islam-Konferenz", "Ost-West-Konflikt"])
def test_de_tokenizer_keeps_hyphens(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1

View File

@ -22,19 +22,27 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
assert len(tokens) == 109
@pytest.mark.parametrize('text', [
@pytest.mark.parametrize(
"text",
[
"Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
"Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
"Kraftfahrzeug-Haftpflichtversicherung",
"Vakuum-Mittelfrequenz-Induktionsofen"])
"Vakuum-Mittelfrequenz-Induktionsofen",
],
)
def test_de_tokenizer_handles_long_words(de_tokenizer, text):
tokens = de_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text,length', [
@pytest.mark.parametrize(
"text,length",
[
("»Was ist mit mir geschehen?«, dachte er.", 12),
("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)])
("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15),
],
)
def test_de_tokenizer_handles_examples(de_tokenizer, text, length):
tokens = de_tokenizer(text)
assert len(tokens) == length

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["αριθ.", "τρισ.", "δισ.", "σελ."])
@pytest.mark.parametrize("text", ["αριθ.", "τρισ.", "δισ.", "σελ."])
def test_el_tokenizer_handles_abbr(el_tokenizer, text):
tokens = el_tokenizer(text)
assert len(tokens) == 1

View File

@ -13,12 +13,22 @@ def test_el_tokenizer_handles_long_text(el_tokenizer):
assert len(tokens) == 54
@pytest.mark.parametrize('text,length',[
@pytest.mark.parametrize(
"text,length",
[
("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8),
("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10),
("Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.", 19),
("Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.", 15),
("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9)])
def test_el_tokenizer_handles_cnts(el_tokenizer,text, length):
(
"Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.",
19,
),
(
"Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.",
15,
),
("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9),
],
)
def test_el_tokenizer_handles_cnts(el_tokenizer, text, length):
tokens = el_tokenizer(text)
assert len(tokens) == length

View File

@ -12,29 +12,66 @@ from spacy.util import compile_infix_regex
def custom_en_tokenizer(en_vocab):
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
custom_infixes = ['\.\.\.+',
'(?<=[0-9])-(?=[0-9])',
custom_infixes = [
"\.\.\.+",
"(?<=[0-9])-(?=[0-9])",
# '(?<=[0-9]+),(?=[0-9]+)',
'[0-9]+(,[0-9]+)+',
'[\[\]!&:,()\*—–\/-]']
"[0-9]+(,[0-9]+)+",
"[\[\]!&:,()\*—–\/-]",
]
infix_re = compile_infix_regex(custom_infixes)
return Tokenizer(en_vocab,
return Tokenizer(
en_vocab,
English.Defaults.tokenizer_exceptions,
prefix_re.search,
suffix_re.search,
infix_re.finditer,
token_match=None)
token_match=None,
)
def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion."
context = [word.text for word in custom_en_tokenizer(sentence)]
assert context == ['The', '8', 'and', '10', '-', 'county', 'definitions',
'are', 'not', 'used', 'for', 'the', 'greater',
'Southern', 'California', 'Megaregion', '.']
assert context == [
"The",
"8",
"and",
"10",
"-",
"county",
"definitions",
"are",
"not",
"used",
"for",
"the",
"greater",
"Southern",
"California",
"Megaregion",
".",
]
# the trailing '-' may cause Assertion Error
sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion."
context = [word.text for word in custom_en_tokenizer(sentence)]
assert context == ['The', '8', '-', 'and', '10', '-', 'county',
'definitions', 'are', 'not', 'used', 'for', 'the',
'greater', 'Southern', 'California', 'Megaregion', '.']
assert context == [
"The",
"8",
"-",
"and",
"10",
"-",
"county",
"definitions",
"are",
"not",
"used",
"for",
"the",
"greater",
"Southern",
"California",
"Megaregion",
".",
]
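The custom_en_tokenizer fixture above builds a Tokenizer by hand from compiled prefix, suffix and infix rules. As a rough, self-contained sketch of the same pattern outside pytest, assuming the spaCy 2.x API used in these tests (the simplified letter-hyphen infix rule and the sample sentence are illustrative only, not taken from the suite):

# Sketch only: a tokenizer whose infix rules also split hyphens between
# letters, mirroring the custom_en_tokenizer fixture above (spaCy 2.x API).
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex, compile_infix_regex

nlp = English()
prefix_re = compile_prefix_regex(English.Defaults.prefixes)
suffix_re = compile_suffix_regex(English.Defaults.suffixes)
# Two illustrative infix patterns: digit-hyphen-digit and letter-hyphen-letter.
infix_re = compile_infix_regex(["(?<=[0-9])-(?=[0-9])", "(?<=[a-z])-(?=[a-zA-Z])"])
tokenizer = Tokenizer(
    nlp.vocab,
    English.Defaults.tokenizer_exceptions,
    prefix_re.search,
    suffix_re.search,
    infix_re.finditer,
    token_match=None,
)
print([t.text for t in tokenizer("The 8- and 10-county definitions.")])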

View File

@ -15,13 +15,15 @@ def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
assert tokens[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
@pytest.mark.parametrize("text", ["`ain't", """"isn't""", "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
@pytest.mark.parametrize(
"text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")]
)
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
@ -29,7 +31,7 @@ def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
assert tokens[1].text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
@pytest.mark.parametrize("text", ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@ -37,14 +39,14 @@ def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@ -53,7 +55,9 @@ def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
@pytest.mark.parametrize(
"text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")]
)
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
@ -62,21 +66,23 @@ def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_titl
assert tokens_lower[1].text == tokens_title[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize("contraction", ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
@pytest.mark.parametrize(
"wo_punct,w_punct", [("We've", "``We've"), ("couldn't", "couldn't)")]
)
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
@ -84,7 +90,7 @@ def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
@ -97,20 +103,24 @@ def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
assert tokens[3].text == "i.e."
@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"])
def test_en_tokenizer_handles_times(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[1].lemma_ in ["a.m.", "p.m."]
@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])])
@pytest.mark.parametrize(
"text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]
)
def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms):
tokens = en_tokenizer(text)
assert [token.norm_ for token in tokens] == norms
@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")])
@pytest.mark.parametrize(
"text,norm", [("radicalised", "radicalized"), ("cuz", "because")]
)
def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm):
tokens = en_tokenizer(text)
assert tokens[0].norm_ == norm

View File

@ -12,14 +12,25 @@ from ...util import get_doc
def test_en_noun_chunks_not_nested(en_tokenizer):
text = "Peter has chronic command and control issues"
heads = [1, 0, 4, 3, -1, -2, -5]
deps = ['nsubj', 'ROOT', 'amod', 'nmod', 'cc', 'conj', 'dobj']
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
tokens.from_array(
[HEAD, DEP],
numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
[-2, conj], [-5, dobj]], dtype='uint64'))
tokens.noun_chunks_iterator = SYNTAX_ITERATORS['noun_chunks']
numpy.asarray(
[
[1, nsubj],
[0, root],
[4, amod],
[3, nmod],
[-1, cc],
[-2, conj],
[-5, dobj],
],
dtype="uint64",
),
)
tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
word_occurred = {}
for chunk in tokens.noun_chunks:
for word in chunk:

View File

@ -7,22 +7,28 @@ from ...util import get_doc
def test_en_parser_noun_chunks_standard(en_tokenizer):
text = "A base phrase should be recognized."
heads = [2, 1, 3, 2, 1, 0, -1]
tags = ['DT', 'JJ', 'NN', 'MD', 'VB', 'VBN', '.']
deps = ['det', 'amod', 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'punct']
tags = ["DT", "JJ", "NN", "MD", "VB", "VBN", "."]
deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 1
assert chunks[0].text_with_ws == "A base phrase "
def test_en_parser_noun_chunks_coordinated(en_tokenizer):
# fmt: off
text = "A base phrase and a good phrase are often the same."
heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
tags = ['DT', 'NN', 'NN', 'CC', 'DT', 'JJ', 'NN', 'VBP', 'RB', 'DT', 'JJ', '.']
deps = ['det', 'compound', 'nsubj', 'cc', 'det', 'amod', 'conj', 'ROOT', 'advmod', 'det', 'attr', 'punct']
tags = ["DT", "NN", "NN", "CC", "DT", "JJ", "NN", "VBP", "RB", "DT", "JJ", "."]
deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "A base phrase "
@ -32,10 +38,12 @@ def test_en_parser_noun_chunks_coordinated(en_tokenizer):
def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
text = "A phrase with another phrase occurs."
heads = [1, 4, -1, 1, -2, 0, -1]
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', '.']
deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT', 'punct']
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ", "."]
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "A phrase "
@ -43,12 +51,16 @@ def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
# fmt: off
text = "Sam, my brother, arrived to the house."
heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.']
deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct']
tags = ["NNP", ",", "PRP$", "NN", ",", "VBD", "IN", "DT", "NN", "."]
deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Sam "
@ -59,10 +71,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
def test_en_parser_noun_chunks_dative(en_tokenizer):
text = "She gave Bob a raise."
heads = [1, 0, -1, 1, -3, -4]
tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.']
deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct']
tags = ["PRP", "VBD", "NNP", "DT", "NN", "."]
deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads
)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "She "
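The long heads/tags/deps literals in the coordinated and appositional-modifier tests above are wrapped in # fmt: off / # fmt: on. These are black's markers for leaving a region untouched, which keeps hand-aligned test data on single lines instead of being exploded into one item per line. A small sketch of the idea (the values below are made up, not taken from a specific test):

# Sketch: black honours "# fmt: off" / "# fmt: on" pairs and leaves the
# enclosed lines exactly as written, so hand-aligned test data stays readable.
# fmt: off
heads = [2, 1,  5, -1,  2]
tags  = ["DT", "NN", "NN", "CC", "DT"]
# fmt: on
deps = ["det", "compound", "nsubj", "cc", "det"]  # outside the pair, black may reflow this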

View File

@ -4,85 +4,85 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(can)"])
@pytest.mark.parametrize("text", ["(can)"])
def test_en_tokenizer_splits_no_special(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["can't"])
@pytest.mark.parametrize("text", ["can't"])
def test_en_tokenizer_splits_no_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(can't"])
@pytest.mark.parametrize("text", ["(can't"])
def test_en_tokenizer_splits_prefix_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["can't)"])
@pytest.mark.parametrize("text", ["can't)"])
def test_en_tokenizer_splits_suffix_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(can't)"])
@pytest.mark.parametrize("text", ["(can't)"])
def test_en_tokenizer_splits_even_wrap(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["(can't?)"])
@pytest.mark.parametrize("text", ["(can't?)"])
def test_en_tokenizer_splits_uneven_wrap(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 5
@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
@pytest.mark.parametrize("text,length", [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
def test_en_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["U.S.)"])
@pytest.mark.parametrize("text", ["U.S.)"])
def test_en_tokenizer_splits_suffix_interact(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(U.S.)"])
@pytest.mark.parametrize("text", ["(U.S.)"])
def test_en_tokenizer_splits_even_wrap_interact(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(U.S.?)"])
@pytest.mark.parametrize("text", ["(U.S.?)"])
def test_en_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text', ["best-known"])
@pytest.mark.parametrize("text", ["best-known"])
def test_en_tokenizer_splits_hyphens(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_en_tokenizer_splits_numeric_range(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["best.Known", "Hello.World"])
@pytest.mark.parametrize("text", ["best.Known", "Hello.World"])
def test_en_tokenizer_splits_period_infix(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Hello,world", "one,two"])
@pytest.mark.parametrize("text", ["Hello,world", "one,two"])
def test_en_tokenizer_splits_comma_infix(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@ -91,7 +91,7 @@ def test_en_tokenizer_splits_comma_infix(en_tokenizer, text):
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["best...Known", "best...known"])
@pytest.mark.parametrize("text", ["best...Known", "best...known"])
def test_en_tokenizer_splits_ellipsis_infix(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@ -126,8 +126,10 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer):
@pytest.mark.xfail
def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
# Re Issue #225
tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """
"""you'll have to walk there.\u2014Ariel.""")
tokens = en_tokenizer(
"""Will this road take me to Puddleton?\u2014No, """
"""you'll have to walk there.\u2014Ariel."""
)
assert tokens[6].text == "Puddleton"
assert tokens[7].text == "?"
assert tokens[8].text == "\u2014"

View File

@ -6,19 +6,19 @@ from spacy.util import compile_prefix_regex
from spacy.lang.punctuation import TOKENIZER_PREFIXES
PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
@pytest.mark.parametrize('text', ["(", "((", "<"])
@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == len(text)
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(punct + text)
assert len(tokens) == 2
@ -26,8 +26,8 @@ def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
assert tokens[1].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(text + punct)
assert len(tokens) == 2
@ -35,9 +35,9 @@ def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
assert tokens[1].text == punct
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
tokens = en_tokenizer(punct + punct_add + text)
assert len(tokens) == 3
@ -46,9 +46,9 @@ def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add,
assert tokens[2].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
tokens = en_tokenizer(text + punct + punct_add)
assert len(tokens) == 3
@ -57,8 +57,8 @@ def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add
assert tokens[2].text == punct_add
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4
@ -66,8 +66,8 @@ def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
assert tokens[3].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4
@ -75,14 +75,14 @@ def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
assert tokens[1].text == punct
@pytest.mark.parametrize('text', ["'The"])
@pytest.mark.parametrize("text", ["'The"])
def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == "'"
@pytest.mark.parametrize('text', ["Hello''"])
@pytest.mark.parametrize("text", ["Hello''"])
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
@ -90,10 +90,11 @@ def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
assert len(tokens_punct) == 1
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"])
def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
punct_close, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_splits_open_close_punct(
en_tokenizer, punct_open, punct_close, text
):
tokens = en_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3
assert tokens[0].text == punct_open
@ -101,11 +102,12 @@ def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
assert tokens[2].text == punct_close
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"])
def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
punct_open2, punct_close2, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Hello"])
def test_en_tokenizer_two_diff_punct(
en_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5
assert tokens[0].text == punct_open2
@ -115,7 +117,7 @@ def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
assert tokens[4].text == punct_close2
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
@pytest.mark.parametrize("text,punct", [("(can't", "(")])
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
match = en_search_prefixes(text)
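Several of the punctuation tests above stack two or three parametrize decorators. Stacked decorators parametrize independently, so the test runs once for every combination of the parameter sets. A small runnable sketch of that behaviour (the values and the test body are illustrative, not from the suite):

import pytest

# Two stacked parametrize decorators -> 2 x 2 = 4 test invocations,
# one per (punct, text) combination.
@pytest.mark.parametrize("punct", ["(", "["])
@pytest.mark.parametrize("text", ["Hello", "World"])
def test_open_punct_combinations(punct, text):
    combined = punct + text
    assert combined.startswith(punct)
    assert combined.endswith(text)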

View File

@ -6,8 +6,8 @@ import pytest
from ...util import get_doc, apply_transition_sequence
@pytest.mark.parametrize('text', ["A test sentence"])
@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
@pytest.mark.parametrize("text", ["A test sentence"])
@pytest.mark.parametrize("punct", [".", "!", "?", ""])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
heads = [2, 1, 0, -1] if punct else [2, 1, 0]
tokens = en_tokenizer(text + punct)
@ -19,16 +19,18 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct):
@pytest.mark.xfail
def test_en_sentence_breaks(en_tokenizer, en_parser):
# fmt: off
text = "This is a sentence . This is another one ."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
'attr', 'punct']
transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
"attr", "punct"]
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT",
"L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
apply_transition_sequence(en_parser, doc, transition)
assert len(list(doc.sents)) == 2
for token in doc:
assert token.dep != 0 or token.is_space
assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
assert [token.head.i for token in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]

View File

@ -6,10 +6,10 @@ from ...util import get_doc
def test_en_tagger_load_morph_exc(en_tokenizer):
text = "I like his style."
tags = ['PRP', 'VBP', 'PRP$', 'NN', '.']
morph_exc = {'VBP': {'like': {'lemma': 'luck'}}}
tags = ["PRP", "VBP", "PRP$", "NN", "."]
morph_exc = {"VBP": {"like": {"lemma": "luck"}}}
en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags)
assert doc[1].tag_ == 'VBP'
assert doc[1].lemma_ == 'luck'
assert doc[1].tag_ == "VBP"
assert doc[1].lemma_ == "luck"

View File

@ -20,30 +20,48 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
assert len(tokens) == 76
@pytest.mark.parametrize('text,length', [
@pytest.mark.parametrize(
"text,length",
[
("The U.S. Army likes Shock and Awe.", 8),
("U.N. regulations are not a part of their concern.", 10),
("“Isn't it?”", 6),
("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15),
("""'Me too!', Mr. P. Delaware cried. """, 11),
("They ran about 10km.", 6),
pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
pytest.param(
"But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
),
],
)
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,match', [
('10', True), ('1', True), ('10,000', True), ('10,00', True),
('999.0', True), ('one', True), ('two', True), ('billion', True),
('dog', False), (',', False), ('1/2', True)])
@pytest.mark.parametrize(
"text,match",
[
("10", True),
("1", True),
("10,000", True),
("10,00", True),
("999.0", True),
("one", True),
("two", True),
("billion", True),
("dog", False),
(",", False),
("1/2", True),
],
)
def test_lex_attrs_like_number(en_tokenizer, text, match):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].like_num == match
@pytest.mark.parametrize('word', ['eleven'])
@pytest.mark.parametrize("word", ["eleven"])
def test_en_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())
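The xfail case in the token-count test above now uses pytest.param(..., marks=pytest.mark.xfail()), which is how individual parametrize cases carry marks in pytest 4.x; wrapping the argument tuple in the mark itself, as in the removed line, is no longer supported. A minimal sketch of the pattern (the whitespace-split assertion and expected counts are illustrative and do not reproduce the tokenizer-based test):

import pytest

@pytest.mark.parametrize(
    "text,n_words",
    [
        ("The U.S. Army likes Shock and Awe.", 7),
        # Marks for a single case are attached via pytest.param(..., marks=...).
        pytest.param(
            "But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail()
        ),
    ],
)
def test_whitespace_word_count(text, n_words):
    assert len(text.split()) == n_words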

View File

@ -4,11 +4,15 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,lemma', [
@pytest.mark.parametrize(
"text,lemma",
[
("aprox.", "aproximadamente"),
("esq.", "esquina"),
("pág.", "página"),
("p.ej.", "por ejemplo")])
("p.ej.", "por ejemplo"),
],
)
def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma):
tokens = es_tokenizer(text)
assert len(tokens) == 1

View File

@ -20,12 +20,16 @@ en Montevideo y que pregona las bondades de la vida austera."""
assert len(tokens) == 90
@pytest.mark.parametrize('text,length', [
@pytest.mark.parametrize(
"text,length",
[
("¿Por qué José Mujica?", 6),
("“¿Oh no?”", 6),
("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11),
("Corrieron aprox. 10km.", 5),
("Y entonces por qué...", 5)])
("Y entonces por qué...", 5),
],
)
def test_es_tokenizer_handles_cnts(es_tokenizer, text, length):
tokens = es_tokenizer(text)
assert len(tokens) == length

View File

@ -5,12 +5,15 @@ import pytest
ABBREVIATION_TESTS = [
('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']),
('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg'])
(
"Hyvää uutta vuotta t. siht. Niemelä!",
["Hyvää", "uutta", "vuotta", "t.", "siht.", "Niemelä", "!"],
),
("Paino on n. 2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]),
]
@pytest.mark.parametrize('text,expected_tokens', ABBREVIATION_TESTS)
@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS)
def test_fi_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens):
tokens = fi_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]

View File

@ -2,26 +2,26 @@
from __future__ import unicode_literals
import pytest
from .... import util
@pytest.fixture(scope='module')
def fr_tokenizer():
return util.get_lang_class('fr').Defaults.create_tokenizer()
@pytest.mark.parametrize('text', [
"aujourd'hui", "Aujourd'hui", "prud'hommes", "prudhommal"])
@pytest.mark.parametrize(
"text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prudhommal"]
)
def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text):
tokens = fr_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text,lemma', [
@pytest.mark.parametrize(
"text,lemma",
[
("janv.", "janvier"),
("juill.", "juillet"),
("Dr.", "docteur"),
("av.", "avant"),
("sept.", "septembre")])
("sept.", "septembre"),
],
)
def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma):
tokens = fr_tokenizer(text)
assert len(tokens) == 1
@ -57,6 +57,7 @@ def test_fr_tokenizer_handles_title(fr_tokenizer):
assert tokens[2].lemma_ == "ce"
@pytest.mark.xfail
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
text = "Est-ce pas génial?"
tokens = fr_tokenizer(text)
@ -65,7 +66,7 @@ def test_fr_tokenizer_handles_title_2(fr_tokenizer):
assert tokens[0].lemma_ == "être"
def test_fr_tokenizer_handles_title_2(fr_tokenizer):
def test_fr_tokenizer_handles_title_3(fr_tokenizer):
text = "Qu'est-ce que tu fais?"
tokens = fr_tokenizer(text)
assert len(tokens) == 7
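This file's diff also drops the locally defined fr_tokenizer fixture shown above being removed, so the tokenizer is defined once and only ever reaches a test through its arguments. A rough sketch of that layout, based on the fixture code visible above (the file names are illustrative):

# conftest.py -- sketch: define the tokenizer fixture once, session-scoped
import pytest
from spacy.util import get_lang_class

@pytest.fixture(scope="session")
def fr_tokenizer():
    return get_lang_class("fr").Defaults.create_tokenizer()


# test_fr_exceptions.py -- sketch: the test receives the fixture purely as an argument
def test_fr_tokenizer_keeps_infix_exception(fr_tokenizer):
    assert len(fr_tokenizer("aujourd'hui")) == 1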

View File

@ -16,7 +16,9 @@ def test_fr_lemmatizer_noun_verb_2(fr_tokenizer):
assert tokens[4].lemma_ == "être"
@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
@pytest.mark.xfail(
reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN"
)
def test_fr_lemmatizer_noun(fr_tokenizer):
tokens = fr_tokenizer("il y a des Costaricienne.")
assert tokens[4].lemma_ == "Costaricain"

View File

@ -7,11 +7,12 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES
from spacy.lang.char_classes import ALPHA
@pytest.mark.parametrize('text,expected_tokens', [
("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])])
@pytest.mark.parametrize(
"text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])]
)
def test_issue768(text, expected_tokens):
"""Allow zero-width 'infix' token during the tokenization process."""
SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA)
SPLIT_INFIX = r"(?<=[{a}]\')(?=[{a}])".format(a=ALPHA)
class FrenchTest(Language):
class Defaults(Language.Defaults):

View File

@ -1,13 +1,5 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from .... import util
@pytest.fixture(scope='module')
def fr_tokenizer():
return util.get_lang_class('fr').Defaults.create_tokenizer()
import pytest
from spacy.lang.fr.lex_attrs import like_num
@ -27,7 +19,7 @@ ou avec un autre vrai humain."""
assert len(tokens) == 113
@pytest.mark.parametrize('word', ['onze', 'onzième'])
@pytest.mark.parametrize("word", ["onze", "onzième"])
def test_fr_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -4,13 +4,15 @@ from __future__ import unicode_literals
import pytest
# fmt: off
GA_TOKEN_EXCEPTION_TESTS = [
('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']),
('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise'])
("Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).", ["Niall", "Ó", "Domhnaill", ",", "Rialtas", "na", "hÉireann", "1977", "(", "lch.", "600", ")", "."]),
("Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise", ["Daoine", "a", "bhfuil", "Gaeilge", "acu", ",", "m.sh.", "tusa", "agus", "mise"])
]
# fmt: on
@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS)
@pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS)
def test_ga_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens):
tokens = ga_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]

View File

@ -4,20 +4,41 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,expected_tokens',
[('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])])
@pytest.mark.parametrize(
"text,expected_tokens",
[("פייתון היא שפת תכנות דינמית", ["פייתון", "היא", "שפת", "תכנות", "דינמית"])],
)
def test_he_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens):
tokens = he_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list
@pytest.mark.parametrize('text,expected_tokens', [
('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.']),
('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']),
('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']),
('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']),
('עקבת אחריו בכל רחבי המדינה...', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '...'])])
@pytest.mark.parametrize(
"text,expected_tokens",
[
(
"עקבת אחריו בכל רחבי המדינה.",
["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "."],
),
(
"עקבת אחריו בכל רחבי המדינה?",
["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "?"],
),
(
"עקבת אחריו בכל רחבי המדינה!",
["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "!"],
),
(
"עקבת אחריו בכל רחבי המדינה..",
["עקבת", "אחריו", "בכל", "רחבי", "המדינה", ".."],
),
(
"עקבת אחריו בכל רחבי המדינה...",
["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "..."],
),
],
)
def test_he_tokenizer_handles_punct(he_tokenizer, text, expected_tokens):
tokens = he_tokenizer(text)
assert expected_tokens == [token.text for token in tokens]

View File

@ -6,11 +6,11 @@ import pytest
DEFAULT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()),
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']),
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']),
pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()),
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
('A pl.', ['A', 'pl.']),
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),
@ -228,11 +228,11 @@ QUOTE_TESTS = [
DOT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']),
pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])),
pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()),
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']),
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']),
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']),
pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])),
pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()),
('Az egy.ketto.', ['Az', 'egy.ketto', '.']),
('A pl.', ['A', 'pl.']),
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']),

View File

@ -4,85 +4,87 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["(Ma'arif)"])
@pytest.mark.parametrize("text", ["(Ma'arif)"])
def test_id_tokenizer_splits_no_special(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Ma'arif"])
@pytest.mark.parametrize("text", ["Ma'arif"])
def test_id_tokenizer_splits_no_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["(Ma'arif"])
@pytest.mark.parametrize("text", ["(Ma'arif"])
def test_id_tokenizer_splits_prefix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["Ma'arif)"])
@pytest.mark.parametrize("text", ["Ma'arif)"])
def test_id_tokenizer_splits_suffix_punct(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(Ma'arif)"])
@pytest.mark.parametrize("text", ["(Ma'arif)"])
def test_id_tokenizer_splits_even_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(Ma'arif?)"])
@pytest.mark.parametrize("text", ["(Ma'arif?)"])
def test_tokenizer_splits_uneven_wrap(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)])
def test_id_tokenizer_splits_prefix_interact(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["S.Kom.)"])
@pytest.mark.parametrize("text", ["S.Kom.)"])
def test_id_tokenizer_splits_suffix_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 2
@pytest.mark.parametrize('text', ["(S.Kom.)"])
@pytest.mark.parametrize("text", ["(S.Kom.)"])
def test_id_tokenizer_splits_even_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["(S.Kom.?)"])
@pytest.mark.parametrize("text", ["(S.Kom.?)"])
def test_id_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 4
@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)])
@pytest.mark.parametrize(
"text,length", [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)]
)
def test_id_tokenizer_splits_hyphens(id_tokenizer, text, length):
tokens = id_tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"])
def test_id_tokenizer_splits_numeric_range(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"])
@pytest.mark.parametrize("text", ["ini.Budi", "Halo.Bandung"])
def test_id_tokenizer_splits_period_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"])
@pytest.mark.parametrize("text", ["Halo,Bandung", "satu,dua"])
def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3
@ -91,7 +93,7 @@ def test_id_tokenizer_splits_comma_infix(id_tokenizer, text):
assert tokens[2].text == text.split(",")[1]
@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"])
@pytest.mark.parametrize("text", ["halo...Bandung", "dia...pergi"])
def test_id_tokenizer_splits_ellipsis_infix(id_tokenizer, text):
tokens = id_tokenizer(text)
assert len(tokens) == 3

View File

@ -5,7 +5,7 @@ import pytest
from spacy.lang.id.lex_attrs import like_num
@pytest.mark.parametrize('word', ['sebelas'])
@pytest.mark.parametrize("word", ["sebelas"])
def test_id_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -4,12 +4,10 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('word,lemma', [
('新しく', '新しい'),
('赤く', '赤い'),
('すごく', '凄い'),
('いただきました', '頂く'),
('なった', '成る')])
@pytest.mark.parametrize(
"word,lemma",
[("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")],
)
def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
test_lemma = ja_tokenizer(word)[0].lemma_
assert test_lemma == lemma

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import pytest
# fmt: off
TOKENIZER_TESTS = [
("日本語だよ", ['日本', '', '', '']),
("東京タワーの近くに住んでいます。", ['東京', 'タワー', '', '近く', '', '住ん', '', '', 'ます', '']),
@ -27,21 +28,22 @@ POS_TESTS = [
('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
]
# fmt: on
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tokens):
tokens = [token.text for token in ja_tokenizer(text)]
assert tokens == expected_tokens
@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_tags):
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
tags = [token.tag_ for token in ja_tokenizer(text)]
assert tags == expected_tags
@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
def test_ja_tokenizer(ja_tokenizer, text, expected_pos):
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
pos = [token.pos_ for token in ja_tokenizer(text)]
assert pos == expected_pos
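The tag and POS tests above are renamed to test_ja_tokenizer_tags and test_ja_tokenizer_pos because all three functions previously shared the name test_ja_tokenizer: in Python, each later definition simply rebinds the name, so pytest only collects the last one and the earlier tests silently never run (flake8 flags this as F811). A tiny sketch of the failure mode, with made-up assertions:

def test_tokenizer():  # never collected: the name is rebound by the definition below
    assert "日本語だよ".startswith("日本")


def test_tokenizer():  # pytest only sees this one
    assert "日本語だよ".endswith("だよ")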

View File

@ -5,12 +5,18 @@ import pytest
NB_TOKEN_EXCEPTION_TESTS = [
('Smørsausen brukes bl.a. til fisk', ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']),
('Jeg kommer først kl. 13 pga. diverse forsinkelser', ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse', 'forsinkelser'])
(
"Smørsausen brukes bl.a. til fisk",
["Smørsausen", "brukes", "bl.a.", "til", "fisk"],
),
(
"Jeg kommer først kl. 13 pga. diverse forsinkelser",
["Jeg", "kommer", "først", "kl.", "13", "pga.", "diverse", "forsinkelser"],
),
]
@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS)
@pytest.mark.parametrize("text,expected_tokens", NB_TOKEN_EXCEPTION_TESTS)
def test_nb_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens):
tokens = nb_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]

View File

@ -5,7 +5,7 @@ import pytest
from spacy.lang.nl.lex_attrs import like_num
@pytest.mark.parametrize('word', ['elf', 'elfde'])
@pytest.mark.parametrize("word", ["elf", "elfde"])
def test_nl_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -5,7 +5,7 @@ import pytest
from spacy.lang.pt.lex_attrs import like_num
@pytest.mark.parametrize('word', ['onze', 'quadragésimo'])
@pytest.mark.parametrize("word", ["onze", "quadragésimo"])
def test_pt_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -4,11 +4,15 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('string,lemma', [
('câini', 'câine'),
('expedițiilor', 'expediție'),
('pensete', 'pensetă'),
('erau', 'fi')])
@pytest.mark.parametrize(
"string,lemma",
[
("câini", "câine"),
("expedițiilor", "expediție"),
("pensete", "pensetă"),
("erau", "fi"),
],
)
def test_ro_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma):
tokens = ro_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@ -5,17 +5,20 @@ import pytest
TEST_CASES = [
('Adresa este str. Principală nr. 5.', ['Adresa', 'este', 'str.', 'Principală', 'nr.', '5', '.']),
('Teste, etc.', ['Teste', ',', 'etc.']),
('Lista, ș.a.m.d.', ['Lista', ',', 'ș.a.m.d.']),
('Și d.p.d.v. al...', ['Și', 'd.p.d.v.', 'al', '...']),
(
"Adresa este str. Principală nr. 5.",
["Adresa", "este", "str.", "Principală", "nr.", "5", "."],
),
("Teste, etc.", ["Teste", ",", "etc."]),
("Lista, ș.a.m.d.", ["Lista", ",", "ș.a.m.d."]),
("Și d.p.d.v. al...", ["Și", "d.p.d.v.", "al", "..."]),
# number tests
('Clasa a 4-a.', ['Clasa', 'a', '4-a', '.']),
('Al 12-lea ceas.', ['Al', '12-lea', 'ceas', '.'])
("Clasa a 4-a.", ["Clasa", "a", "4-a", "."]),
("Al 12-lea ceas.", ["Al", "12-lea", "ceas", "."]),
]
@pytest.mark.parametrize('text,expected_tokens', TEST_CASES)
@pytest.mark.parametrize("text,expected_tokens", TEST_CASES)
def test_ro_tokenizer_handles_testcases(ro_tokenizer, text, expected_tokens):
tokens = ro_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]

View File

@ -4,10 +4,10 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,norms', [
("пн.", ["понедельник"]),
("пт.", ["пятница"]),
("дек.", ["декабрь"])])
@pytest.mark.parametrize(
"text,norms",
[("пн.", ["понедельник"]), ("пт.", ["пятница"]), ("дек.", ["декабрь"])],
)
def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms):
tokens = ru_tokenizer(text)
assert len(tokens) == 1

View File

@ -9,55 +9,71 @@ from ...util import get_doc
@pytest.fixture
def ru_lemmatizer():
pymorphy = pytest.importorskip('pymorphy2')
pymorphy = pytest.importorskip("pymorphy2")
return Russian.Defaults.create_lemmatizer()
def test_ru_doc_lemmatization(ru_tokenizer):
words = ['мама', 'мыла', 'раму']
tags = ['NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing',
'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act',
'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing']
words = ["мама", "мыла", "раму"]
tags = [
"NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing",
"VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
"NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing",
]
doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags)
lemmas = [token.lemma_ for token in doc]
assert lemmas == ['мама', 'мыть', 'рама']
assert lemmas == ["мама", "мыть", "рама"]
@pytest.mark.parametrize('text,lemmas', [
('гвоздики', ['гвоздик', 'гвоздика']),
('люди', ['человек']),
('реки', ['река']),
('кольцо', ['кольцо']),
('пепперони', ['пепперони'])])
@pytest.mark.parametrize(
"text,lemmas",
[
("гвоздики", ["гвоздик", "гвоздика"]),
("люди", ["человек"]),
("реки", ["река"]),
("кольцо", ["кольцо"]),
("пепперони", ["пепперони"]),
],
)
def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas):
assert sorted(ru_lemmatizer.noun(text)) == lemmas
@pytest.mark.models('ru')
@pytest.mark.parametrize('text,pos,morphology,lemma', [
('рой', 'NOUN', None, 'рой'),
('рой', 'VERB', None, 'рыть'),
('клей', 'NOUN', None, 'клей'),
('клей', 'VERB', None, 'клеить'),
('три', 'NUM', None, 'три'),
('кос', 'NOUN', {'Number': 'Sing'}, 'кос'),
('кос', 'NOUN', {'Number': 'Plur'}, 'коса'),
('кос', 'ADJ', None, 'косой'),
('потом', 'NOUN', None, 'пот'),
('потом', 'ADV', None, 'потом')])
def test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma):
@pytest.mark.models("ru")
@pytest.mark.parametrize(
"text,pos,morphology,lemma",
[
("рой", "NOUN", None, "рой"),
("рой", "VERB", None, "рыть"),
("клей", "NOUN", None, "клей"),
("клей", "VERB", None, "клеить"),
("три", "NUM", None, "три"),
("кос", "NOUN", {"Number": "Sing"}, "кос"),
("кос", "NOUN", {"Number": "Plur"}, "коса"),
("кос", "ADJ", None, "косой"),
("потом", "NOUN", None, "пот"),
("потом", "ADV", None, "потом"),
],
)
def test_ru_lemmatizer_works_with_different_pos_homonyms(
ru_lemmatizer, text, pos, morphology, lemma
):
assert ru_lemmatizer(text, pos, morphology) == [lemma]
@pytest.mark.parametrize('text,morphology,lemma', [
('гвоздики', {'Gender': 'Fem'}, 'гвоздика'),
('гвоздики', {'Gender': 'Masc'}, 'гвоздик'),
('вина', {'Gender': 'Fem'}, 'вина'),
('вина', {'Gender': 'Neut'}, 'вино')])
@pytest.mark.parametrize(
"text,morphology,lemma",
[
("гвоздики", {"Gender": "Fem"}, "гвоздика"),
("гвоздики", {"Gender": "Masc"}, "гвоздик"),
("вина", {"Gender": "Fem"}, "вина"),
("вина", {"Gender": "Neut"}, "вино"),
],
)
def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma):
assert ru_lemmatizer.noun(text, morphology) == [lemma]
def test_ru_lemmatizer_punct(ru_lemmatizer):
assert ru_lemmatizer.punct('«') == ['"']
assert ru_lemmatizer.punct('»') == ['"']
assert ru_lemmatizer.punct("«") == ['"']
assert ru_lemmatizer.punct("»") == ['"']

View File

@ -5,7 +5,7 @@ import pytest
from spacy.lang.ru.lex_attrs import like_num
@pytest.mark.parametrize('word', ['одиннадцать'])
@pytest.mark.parametrize("word", ["одиннадцать"])
def test_ru_lex_attrs_capitals(word):
assert like_num(word)
assert like_num(word.upper())

View File

@ -4,19 +4,19 @@ from __future__ import unicode_literals
import pytest
PUNCT_OPEN = ['(', '[', '{', '*']
PUNCT_CLOSE = [')', ']', '}', '*']
PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
PUNCT_OPEN = ["(", "[", "{", "*"]
PUNCT_CLOSE = [")", "]", "}", "*"]
PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")]
@pytest.mark.parametrize('text', ["(", "((", "<"])
@pytest.mark.parametrize("text", ["(", "((", "<"])
def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text):
tokens = ru_tokenizer(text)
assert len(tokens) == len(text)
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Привет"])
def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
tokens = ru_tokenizer(punct + text)
assert len(tokens) == 2
@ -24,8 +24,8 @@ def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text):
assert tokens[1].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Привет"])
def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
tokens = ru_tokenizer(text + punct)
assert len(tokens) == 2
@ -33,9 +33,9 @@ def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text):
assert tokens[1].text == punct
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Привет"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("punct_add", ["`"])
@pytest.mark.parametrize("text", ["Привет"])
def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text):
tokens = ru_tokenizer(punct + punct_add + text)
assert len(tokens) == 3
@ -44,9 +44,9 @@ def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add,
assert tokens[2].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Привет"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("punct_add", ["'"])
@pytest.mark.parametrize("text", ["Привет"])
def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text):
tokens = ru_tokenizer(text + punct + punct_add)
assert len(tokens) == 3
@ -55,8 +55,8 @@ def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add
assert tokens[2].text == punct_add
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Привет"])
@pytest.mark.parametrize("punct", PUNCT_OPEN)
@pytest.mark.parametrize("text", ["Привет"])
def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
tokens = ru_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4
@ -64,8 +64,8 @@ def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text):
assert tokens[3].text == text
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Привет"])
@pytest.mark.parametrize("punct", PUNCT_CLOSE)
@pytest.mark.parametrize("text", ["Привет"])
def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
tokens = ru_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4
@ -73,14 +73,14 @@ def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text):
assert tokens[1].text == punct
@pytest.mark.parametrize('text', ["'Тест"])
@pytest.mark.parametrize("text", ["'Тест"])
def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text):
tokens = ru_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == "'"
@pytest.mark.parametrize('text', ["Тест''"])
@pytest.mark.parametrize("text", ["Тест''"])
def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
tokens = ru_tokenizer(text)
assert len(tokens) == 2
@ -88,10 +88,11 @@ def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text):
assert len(tokens_punct) == 1
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
punct_close, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("text", ["Тест"])
def test_ru_tokenizer_splits_open_close_punct(
ru_tokenizer, punct_open, punct_close, text
):
tokens = ru_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3
assert tokens[0].text == punct_open
@ -99,11 +100,12 @@ def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open,
assert tokens[2].text == punct_close
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Тест"])
def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
punct_open2, punct_close2, text):
@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED)
@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")])
@pytest.mark.parametrize("text", ["Тест"])
def test_ru_tokenizer_two_diff_punct(
ru_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text
):
tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5
assert tokens[0].text == punct_open2
@ -113,7 +115,7 @@ def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close,
assert tokens[4].text == punct_close2
@pytest.mark.parametrize('text', ["Тест."])
@pytest.mark.parametrize("text", ["Тест."])
def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text):
tokens = ru_tokenizer(text)
assert tokens[1].text == "."

View File

@ -5,20 +5,29 @@ import pytest
SV_TOKEN_EXCEPTION_TESTS = [
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']),
('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."])
(
"Smörsåsen används bl.a. till fisk",
["Smörsåsen", "används", "bl.a.", "till", "fisk"],
),
(
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
),
(
"Anders I. tycker om ord med i i.",
["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."],
),
]
@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS)
def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens):
tokens = sv_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list
@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"])
@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"])
def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text):
tokens = sv_tokenizer(text)
assert len(tokens) == 2

View File

@ -6,53 +6,85 @@ from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA
from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape
@pytest.mark.parametrize('text', ["dog"])
@pytest.mark.parametrize("text", ["dog"])
def test_attrs_key(text):
assert intify_attrs({"ORTH": text}) == {ORTH: text}
assert intify_attrs({"NORM": text}) == {NORM: text}
assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10}
@pytest.mark.parametrize('text', ["dog"])
@pytest.mark.parametrize("text", ["dog"])
def test_attrs_idempotence(text):
int_attrs = intify_attrs({"lemma": text, 'is_alpha': True}, strings_map={text: 10})
int_attrs = intify_attrs({"lemma": text, "is_alpha": True}, strings_map={text: 10})
assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True}
@pytest.mark.parametrize('text', ["dog"])
@pytest.mark.parametrize("text", ["dog"])
def test_attrs_do_deprecated(text):
int_attrs = intify_attrs({"F": text, 'is_alpha': True}, strings_map={text: 10},
_do_deprecated=True)
int_attrs = intify_attrs(
{"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True
)
assert int_attrs == {ORTH: 10, IS_ALPHA: True}
@pytest.mark.parametrize('text,match', [(',', True), (' ', False), ('a', False)])
@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)])
def test_lex_attrs_is_punct(text, match):
assert is_punct(text) == match
@pytest.mark.parametrize('text,match', [(',', True), ('£', False), ('♥', False)])
@pytest.mark.parametrize("text,match", [(",", True), ("£", False), ("♥", False)])
def test_lex_attrs_is_ascii(text, match):
assert is_ascii(text) == match
@pytest.mark.parametrize('text,match', [('$', True), ('£', True), ('♥', False),
('€', True), ('¥', True), ('¢', True),
('a', False), ('www.google.com', False), ('dog', False)])
@pytest.mark.parametrize(
"text,match",
[
("$", True),
("£", True),
("", False),
("", True),
("¥", True),
("¢", True),
("a", False),
("www.google.com", False),
("dog", False),
],
)
def test_lex_attrs_is_currency(text, match):
assert is_currency(text) == match
@pytest.mark.parametrize('text,match', [
('www.google.com', True), ('google.com', True), ('sydney.com', True),
('2girls1cup.org', True), ('http://stupid', True), ('www.hi', True),
('dog', False), ('1.2', False), ('1.a', False), ('hello.There', False)])
@pytest.mark.parametrize(
"text,match",
[
("www.google.com", True),
("google.com", True),
("sydney.com", True),
("2girls1cup.org", True),
("http://stupid", True),
("www.hi", True),
("dog", False),
("1.2", False),
("1.a", False),
("hello.There", False),
],
)
def test_lex_attrs_like_url(text, match):
assert like_url(text) == match
@pytest.mark.parametrize('text,shape', [
('Nasa', 'Xxxx'), ('capitalized', 'xxxx'), ('999999999', 'dddd'),
('C3P0', 'XdXd'), (',', ','), ('\n', '\n'), ('``,-', '``,-')])
@pytest.mark.parametrize(
"text,shape",
[
("Nasa", "Xxxx"),
("capitalized", "xxxx"),
("999999999", "dddd"),
("C3P0", "XdXd"),
(",", ","),
("\n", "\n"),
("``,-", "``,-"),
],
)
def test_lex_attrs_word_shape(text, shape):
assert word_shape(text) == shape
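
The lexical attribute helpers exercised above are plain functions over strings, so they can be tried directly; a minimal sketch mirroring a few of the parametrized cases shown in this file (illustrative only, not part of the commit):

from spacy.lang.lex_attrs import is_currency, like_url, word_shape

# Each helper takes a single string and returns a plain value, which is what
# the parametrized assertions above compare against.
assert is_currency("$")
assert not is_currency("dog")
assert like_url("www.google.com")
assert word_shape("Nasa") == "Xxxx"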

View File

@ -4,8 +4,9 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text,expected_tokens', [
("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])])
@pytest.mark.parametrize(
"text,expected_tokens", [("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"])]
)
def test_th_tokenizer(th_tokenizer, text, expected_tokens):
tokens = [token.text for token in th_tokenizer(text)]
assert tokens == expected_tokens

View File

@ -4,14 +4,18 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('string,lemma', [
('evlerimizdeki', 'ev'),
('işlerimizi', 'iş'),
('biran', 'biran'),
('bitirmeliyiz', 'bitir'),
('isteklerimizi', 'istek'),
('karşılaştırmamızın', 'karşılaştır'),
('çoğulculuktan', 'çoğulcu')])
@pytest.mark.parametrize(
"string,lemma",
[
("evlerimizdeki", "ev"),
("işlerimizi", ""),
("biran", "biran"),
("bitirmeliyiz", "bitir"),
("isteklerimizi", "istek"),
("karşılaştırmamızın", "karşılaştır"),
("çoğulculuktan", "çoğulcu"),
],
)
def test_tr_lemmatizer_lookup_assigns(tr_tokenizer, string, lemma):
tokens = tr_tokenizer(string)
assert tokens[0].lemma_ == lemma

View File

@ -6,14 +6,16 @@ import pytest
INFIX_HYPHEN_TESTS = [
("Явым-төшем күләме.", "Явым-төшем күләме .".split()),
("Хатын-кыз киеме.", "Хатын-кыз киеме .".split())
("Хатын-кыз киеме.", "Хатын-кыз киеме .".split()),
]
PUNC_INSIDE_WORDS_TESTS = [
("Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
(
"Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.",
"Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ,"
" 783,9 млн. кеше / елда .".split()),
("Ту\"кай", "Ту \" кай".split())
" 783,9 млн. кеше / елда .".split(),
),
('Ту"кай', 'Ту " кай'.split()),
]
MIXED_ORDINAL_NUMS_TESTS = [
@ -22,14 +24,14 @@ MIXED_ORDINAL_NUMS_TESTS = [
ABBREV_TESTS = [
("«3 елда (б.э.к.) туган", "« 3 елда ( б.э.к. ) туган".split()),
("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split())
("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split()),
]
NAME_ABBREV_TESTS = [
("Ә.Тукай", "Ә.Тукай".split()),
("Ә.тукай", "Ә.тукай".split()),
("ә.Тукай", "ә . Тукай".split()),
("Миләүшә.", "Миләүшә .".split())
("Миләүшә.", "Миләүшә .".split()),
]
TYPOS_IN_PUNC_TESTS = [
@ -37,11 +39,12 @@ TYPOS_IN_PUNC_TESTS = [
("«3 елда,туган", "« 3 елда , туган".split()),
("«3 елда,туган.", "« 3 елда , туган .".split()),
("Ул эшли(кайчан?)", "Ул эшли ( кайчан ? )".split()),
("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()) # "?)" => "?)" or "? )"
("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()), # "?)" => "?)" or "? )"
]
LONG_TEXTS_TESTS = [
("Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
(
"Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы"
"якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз"
"меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең"
"салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын"
@ -50,17 +53,25 @@ LONG_TEXTS_TESTS = [
"якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз"
"меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең"
"салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын"
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split()
"кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split(),
)
]
TESTCASES = (INFIX_HYPHEN_TESTS + PUNC_INSIDE_WORDS_TESTS +
MIXED_ORDINAL_NUMS_TESTS + ABBREV_TESTS + NAME_ABBREV_TESTS +
LONG_TEXTS_TESTS + TYPOS_IN_PUNC_TESTS)
TESTCASES = (
INFIX_HYPHEN_TESTS
+ PUNC_INSIDE_WORDS_TESTS
+ MIXED_ORDINAL_NUMS_TESTS
+ ABBREV_TESTS
+ NAME_ABBREV_TESTS
+ LONG_TEXTS_TESTS
+ TYPOS_IN_PUNC_TESTS
)
NORM_TESTCASES = [
("тукымадан һ.б.ш. тегелгән.",
["тукымадан", "һәм башка шундыйлар", "тегелгән", "."])
(
"тукымадан һ.б.ш. тегелгән.",
["тукымадан", "һәм башка шундыйлар", "тегелгән", "."],
)
]
@ -70,7 +81,7 @@ def test_tt_tokenizer_handles_testcases(tt_tokenizer, text, expected_tokens):
assert expected_tokens == tokens
@pytest.mark.parametrize('text,norms', NORM_TESTCASES)
@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
def test_tt_tokenizer_handles_norm_exceptions(tt_tokenizer, text, norms):
tokens = tt_tokenizer(text)
assert [token.norm_ for token in tokens] == norms

View File

@ -13,9 +13,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
assert len(tokens) == 77
@pytest.mark.parametrize('text,length', [
("تحریر باسط حبیب", 3),
("میرا پاکستان", 2)])
@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
def test_ur_tokenizer_handles_cnts(ur_tokenizer, text, length):
tokens = ur_tokenizer(text)
assert len(tokens) == length

View File

@ -10,9 +10,11 @@ from ..util import get_doc
@pytest.fixture
def matcher(en_vocab):
rules = {'JS': [[{'ORTH': 'JavaScript'}]],
'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]],
'Java': [[{'LOWER': 'java'}]]}
rules = {
"JS": [[{"ORTH": "JavaScript"}]],
"GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]],
"Java": [[{"LOWER": "java"}]],
}
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
matcher.add(key, None, *patterns)
@ -21,44 +23,44 @@ def matcher(en_vocab):
def test_matcher_from_api_docs(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{'ORTH': 'test'}]
pattern = [{"ORTH": "test"}]
assert len(matcher) == 0
matcher.add('Rule', None, pattern)
matcher.add("Rule", None, pattern)
assert len(matcher) == 1
matcher.remove('Rule')
assert 'Rule' not in matcher
matcher.add('Rule', None, pattern)
assert 'Rule' in matcher
on_match, patterns = matcher.get('Rule')
matcher.remove("Rule")
assert "Rule" not in matcher
matcher.add("Rule", None, pattern)
assert "Rule" in matcher
on_match, patterns = matcher.get("Rule")
assert len(patterns[0])
def test_matcher_from_usage_docs(en_vocab):
text = "Wow 😀 This is really cool! 😂 😂"
doc = Doc(en_vocab, words=text.split(' '))
pos_emoji = ['😀', '😃', '😂', '🤣', '😊', '😍']
pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]
doc = Doc(en_vocab, words=text.split(" "))
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i]
if doc.vocab.strings[match_id] == 'HAPPY':
if doc.vocab.strings[match_id] == "HAPPY":
doc.sentiment += 0.1
span = doc[start : end]
span = doc[start:end]
token = span.merge()
token.vocab[token.text].norm_ = 'happy emoji'
token.vocab[token.text].norm_ = "happy emoji"
matcher = Matcher(en_vocab)
matcher.add('HAPPY', label_sentiment, *pos_patterns)
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matches = matcher(doc)
assert doc.sentiment != 0
assert doc[1].norm_ == 'happy emoji'
assert doc[1].norm_ == "happy emoji"
def test_matcher_len_contains(matcher):
assert len(matcher) == 3
matcher.add('TEST', None, [{'ORTH': 'test'}])
assert 'TEST' in matcher
assert 'TEST2' not in matcher
matcher.add("TEST", None, [{"ORTH": "test"}])
assert "TEST" in matcher
assert "TEST2" not in matcher
def test_matcher_no_match(matcher):
@ -68,38 +70,40 @@ def test_matcher_no_match(matcher):
def test_matcher_match_start(matcher):
doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"])
assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)]
assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)]
def test_matcher_match_end(matcher):
words = ["I", "like", "java"]
doc = Doc(matcher.vocab, words=words)
assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)]
assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)]
def test_matcher_match_middle(matcher):
words = ["I", "like", "Google", "Now", "best"]
doc = Doc(matcher.vocab, words=words)
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)]
assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)]
def test_matcher_match_multi(matcher):
words = ["I", "like", "Google", "Now", "and", "java", "best"]
doc = Doc(matcher.vocab, words=words)
assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4),
(doc.vocab.strings['Java'], 5, 6)]
assert matcher(doc) == [
(doc.vocab.strings["GoogleNow"], 2, 4),
(doc.vocab.strings["Java"], 5, 6),
]
def test_matcher_empty_dict(en_vocab):
"""Test matcher allows empty token specs, meaning match on any token."""
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=["a", "b", "c"])
matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}])
matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
matcher = Matcher(en_vocab)
matcher.add('A.', None, [{'ORTH': 'a'}, {}])
matcher.add("A.", None, [{"ORTH": "a"}, {}])
matches = matcher(doc)
assert matches[0][1:] == (0, 2)
@ -107,8 +111,8 @@ def test_matcher_empty_dict(en_vocab):
def test_matcher_operator_shadow(en_vocab):
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=["a", "b", "c"])
pattern = [{'ORTH': 'a'}, {"IS_ALPHA": True, "OP": "+"}, {'ORTH': 'c'}]
matcher.add('A.C', None, pattern)
pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
matcher.add("A.C", None, pattern)
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
@ -117,43 +121,48 @@ def test_matcher_operator_shadow(en_vocab):
def test_matcher_match_zero(matcher):
words1 = 'He said , " some words " ...'.split()
words2 = 'He said , " some three words " ...'.split()
pattern1 = [{'ORTH': '"'},
{'OP': '!', 'IS_PUNCT': True},
{'OP': '!', 'IS_PUNCT': True},
{'ORTH': '"'}]
pattern2 = [{'ORTH': '"'},
{'IS_PUNCT': True},
{'IS_PUNCT': True},
{'IS_PUNCT': True},
{'ORTH': '"'}]
matcher.add('Quote', None, pattern1)
pattern1 = [
{"ORTH": '"'},
{"OP": "!", "IS_PUNCT": True},
{"OP": "!", "IS_PUNCT": True},
{"ORTH": '"'},
]
pattern2 = [
{"ORTH": '"'},
{"IS_PUNCT": True},
{"IS_PUNCT": True},
{"IS_PUNCT": True},
{"ORTH": '"'},
]
matcher.add("Quote", None, pattern1)
doc = Doc(matcher.vocab, words=words1)
assert len(matcher(doc)) == 1
doc = Doc(matcher.vocab, words=words2)
assert len(matcher(doc)) == 0
matcher.add('Quote', None, pattern2)
matcher.add("Quote", None, pattern2)
assert len(matcher(doc)) == 0
def test_matcher_match_zero_plus(matcher):
words = 'He said , " some words " ...'.split()
pattern = [{'ORTH': '"'},
{'OP': '*', 'IS_PUNCT': False},
{'ORTH': '"'}]
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
matcher = Matcher(matcher.vocab)
matcher.add('Quote', None, pattern)
matcher.add("Quote", None, pattern)
doc = Doc(matcher.vocab, words=words)
assert len(matcher(doc)) == 1
def test_matcher_match_one_plus(matcher):
control = Matcher(matcher.vocab)
control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}])
doc = Doc(control.vocab, words=['Philippe', 'Philippe'])
control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}])
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
m = control(doc)
assert len(m) == 2
matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'},
{'ORTH': 'Philippe', 'OP': '+'}])
matcher.add(
"KleenePhilippe",
None,
[{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}],
)
m = matcher(doc)
assert len(m) == 1
@ -161,54 +170,70 @@ def test_matcher_match_one_plus(matcher):
def test_matcher_any_token_operator(en_vocab):
"""Test that patterns with "any token" {} work with operators."""
matcher = Matcher(en_vocab)
matcher.add('TEST', None, [{'ORTH': 'test'}, {'OP': '*'}])
doc = Doc(en_vocab, words=['test', 'hello', 'world'])
matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
doc = Doc(en_vocab, words=["test", "hello", "world"])
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
assert matches[0] == 'test'
assert matches[1] == 'test hello'
assert matches[2] == 'test hello world'
assert matches[0] == "test"
assert matches[1] == "test hello"
assert matches[2] == "test hello world"
@pytest.fixture
def text():
return u"The quick brown fox jumped over the lazy fox"
return "The quick brown fox jumped over the lazy fox"
@pytest.fixture
def heads():
return [3,2,1,1,0,-1,2,1,-3]
return [3, 2, 1, 1, 0, -1, 2, 1, -3]
@pytest.fixture
def deps():
return ['det', 'amod', 'amod', 'nsubj', 'prep', 'pobj', 'det', 'amod']
return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
@pytest.fixture
def dependency_tree_matcher(en_vocab):
is_brown_yellow = lambda text: bool(re.compile(r'brown|yellow|over').match(text))
def is_brown_yellow(text):
return bool(re.compile(r"brown|yellow|over").match(text))
IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow)
pattern1 = [
{'SPEC': {'NODE_NAME': 'fox'}, 'PATTERN': {'ORTH': 'fox'}},
{'SPEC': {'NODE_NAME': 'q', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'},'PATTERN': {'LOWER': u'quick'}},
{'SPEC': {'NODE_NAME': 'r', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}}
{"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}},
{
"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {"LOWER": "quick"},
},
{
"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {IS_BROWN_YELLOW: True},
},
]
pattern2 = [
{'SPEC': {'NODE_NAME': 'jumped'}, 'PATTERN': {'ORTH': 'jumped'}},
{'SPEC': {'NODE_NAME': 'fox', 'NBOR_RELOP': '>', 'NBOR_NAME': 'jumped'},'PATTERN': {'LOWER': u'fox'}},
{'SPEC': {'NODE_NAME': 'over', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}}
{"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
{
"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"},
"PATTERN": {"LOWER": "fox"},
},
{
"SPEC": {"NODE_NAME": "over", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},
"PATTERN": {IS_BROWN_YELLOW: True},
},
]
matcher = DependencyTreeMatcher(en_vocab)
matcher.add('pattern1', None, pattern1)
matcher.add('pattern2', None, pattern2)
matcher.add("pattern1", None, pattern1)
matcher.add("pattern2", None, pattern2)
return matcher
def test_dependency_tree_matcher_compile(dependency_tree_matcher):
assert len(dependency_tree_matcher) == 2
def test_dependency_tree_matcher(dependency_tree_matcher,text,heads,deps):
doc = get_doc(dependency_tree_matcher.vocab,text.split(),heads=heads,deps=deps)
def test_dependency_tree_matcher(dependency_tree_matcher, text, heads, deps):
doc = get_doc(dependency_tree_matcher.vocab, text.split(), heads=heads, deps=deps)
matches = dependency_tree_matcher(doc)
assert len(matches) == 2
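
For readers skimming the diff, a minimal standalone sketch of the token-pattern API these tests exercise, using the spaCy 2.x-style Matcher.add(key, callback, *patterns) signature seen above; the bare Vocab() is an assumption made for brevity (the tests use the shared en_vocab fixture), which is enough here because the pattern only checks ORTH:

from spacy.matcher import Matcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()  # assumption: a bare vocab in place of the en_vocab fixture
matcher = Matcher(vocab)
# One rule keyed "GoogleNow" with a single two-token pattern, as in the fixture above.
matcher.add("GoogleNow", None, [{"ORTH": "Google"}, {"ORTH": "Now"}])
doc = Doc(vocab, words=["I", "like", "Google", "Now", "best"])
matches = matcher(doc)  # list of (match_id, start, end) triples
assert [(vocab.strings[match_id], start, end) for match_id, start, end in matches] == [
    ("GoogleNow", 2, 4)
]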

View File

@ -7,17 +7,25 @@ from spacy.matcher import Matcher
from spacy.tokens import Doc
pattern1 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}]
pattern2 = [{'ORTH':'A', 'OP':'*'}, {'ORTH':'A', 'OP':'1'}]
pattern3 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'1'}]
pattern4 = [{'ORTH':'B', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]
pattern5 = [{'ORTH':'B', 'OP':'*'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}]
pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}]
pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}]
pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}]
pattern4 = [
{"ORTH": "B", "OP": "1"},
{"ORTH": "A", "OP": "*"},
{"ORTH": "B", "OP": "1"},
]
pattern5 = [
{"ORTH": "B", "OP": "*"},
{"ORTH": "A", "OP": "*"},
{"ORTH": "B", "OP": "1"},
]
re_pattern1 = 'AA*'
re_pattern2 = 'A*A'
re_pattern3 = 'AA'
re_pattern4 = 'BA*B'
re_pattern5 = 'B*A*B'
re_pattern1 = "AA*"
re_pattern2 = "A*A"
re_pattern3 = "AA"
re_pattern4 = "BA*B"
re_pattern5 = "B*A*B"
@pytest.fixture
@ -27,17 +35,20 @@ def text():
@pytest.fixture
def doc(en_tokenizer, text):
doc = en_tokenizer(' '.join(text))
doc = en_tokenizer(" ".join(text))
return doc
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
(pattern1, re_pattern1),
(pattern2, re_pattern2),
(pattern3, re_pattern3),
@pytest.mark.parametrize(
"pattern,re_pattern",
[
pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()),
pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()),
pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()),
(pattern4, re_pattern4),
(pattern5, re_pattern5)])
pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()),
],
)
def test_greedy_matching(doc, text, pattern, re_pattern):
"""Test that the greedy matching behavior of the * op is consistant with
other re implementations."""
@ -50,12 +61,16 @@ def test_greedy_matching(doc, text, pattern, re_pattern):
@pytest.mark.xfail
@pytest.mark.parametrize('pattern,re_pattern', [
@pytest.mark.parametrize(
"pattern,re_pattern",
[
(pattern1, re_pattern1),
(pattern2, re_pattern2),
(pattern3, re_pattern3),
(pattern4, re_pattern4),
(pattern5, re_pattern5)])
(pattern5, re_pattern5),
],
)
def test_match_consuming(doc, text, pattern, re_pattern):
"""Test that matcher.__call__ consumes tokens on a match similar to
re.findall."""
@ -68,33 +83,33 @@ def test_match_consuming(doc, text, pattern, re_pattern):
def test_operator_combos(en_vocab):
cases = [
('aaab', 'a a a b', True),
('aaab', 'a+ b', True),
('aaab', 'a+ a+ b', True),
('aaab', 'a+ a+ a b', True),
('aaab', 'a+ a+ a+ b', True),
('aaab', 'a+ a a b', True),
('aaab', 'a+ a a', True),
('aaab', 'a+', True),
('aaa', 'a+ b', False),
('aaa', 'a+ a+ b', False),
('aaa', 'a+ a+ a+ b', False),
('aaa', 'a+ a b', False),
('aaa', 'a+ a a b', False),
('aaab', 'a+ a a', True),
('aaab', 'a+', True),
('aaab', 'a+ a b', True)
("aaab", "a a a b", True),
("aaab", "a+ b", True),
("aaab", "a+ a+ b", True),
("aaab", "a+ a+ a b", True),
("aaab", "a+ a+ a+ b", True),
("aaab", "a+ a a b", True),
("aaab", "a+ a a", True),
("aaab", "a+", True),
("aaa", "a+ b", False),
("aaa", "a+ a+ b", False),
("aaa", "a+ a+ a+ b", False),
("aaa", "a+ a b", False),
("aaa", "a+ a a b", False),
("aaab", "a+ a a", True),
("aaab", "a+", True),
("aaab", "a+ a b", True),
]
for string, pattern_str, result in cases:
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=list(string))
pattern = []
for part in pattern_str.split():
if part.endswith('+'):
pattern.append({'ORTH': part[0], 'OP': '+'})
if part.endswith("+"):
pattern.append({"ORTH": part[0], "OP": "+"})
else:
pattern.append({'ORTH': part})
matcher.add('PATTERN', None, pattern)
pattern.append({"ORTH": part})
matcher.add("PATTERN", None, pattern)
matches = matcher(doc)
if result:
assert matches, (string, pattern_str)
@ -105,12 +120,12 @@ def test_operator_combos(en_vocab):
def test_matcher_end_zero_plus(en_vocab):
"""Test matcher works when patterns end with * operator. (issue 1450)"""
matcher = Matcher(en_vocab)
pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
matcher.add('TSTEND', None, pattern)
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
matcher.add("TSTEND", None, pattern)
nlp = lambda string: Doc(matcher.vocab, words=string.split())
assert len(matcher(nlp('a'))) == 1
assert len(matcher(nlp('a b'))) == 2
assert len(matcher(nlp('a c'))) == 1
assert len(matcher(nlp('a b c'))) == 2
assert len(matcher(nlp('a b b c'))) == 3
assert len(matcher(nlp('a b b'))) == 3
assert len(matcher(nlp("a"))) == 1
assert len(matcher(nlp("a b"))) == 2
assert len(matcher(nlp("a c"))) == 1
assert len(matcher(nlp("a b c"))) == 2
assert len(matcher(nlp("a b b c"))) == 3
assert len(matcher(nlp("a b b"))) == 3

View File

@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
@ -11,7 +10,7 @@ from ..util import get_doc
def test_matcher_phrase_matcher(en_vocab):
doc = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
matcher.add('COMPANY', None, doc)
matcher.add("COMPANY", None, doc)
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert len(matcher(doc)) == 1
@ -19,63 +18,63 @@ def test_matcher_phrase_matcher(en_vocab):
def test_phrase_matcher_length(en_vocab):
matcher = PhraseMatcher(en_vocab)
assert len(matcher) == 0
matcher.add('TEST', None, Doc(en_vocab, words=['test']))
matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
assert len(matcher) == 1
matcher.add('TEST2', None, Doc(en_vocab, words=['test2']))
matcher.add("TEST2", None, Doc(en_vocab, words=["test2"]))
assert len(matcher) == 2
def test_phrase_matcher_contains(en_vocab):
matcher = PhraseMatcher(en_vocab)
matcher.add('TEST', None, Doc(en_vocab, words=['test']))
assert 'TEST' in matcher
assert 'TEST2' not in matcher
matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
assert "TEST" in matcher
assert "TEST2" not in matcher
def test_phrase_matcher_string_attrs(en_vocab):
words1 = ['I', 'like', 'cats']
pos1 = ['PRON', 'VERB', 'NOUN']
words2 = ['Yes', ',', 'you', 'hate', 'dogs', 'very', 'much']
pos2 = ['INTJ', 'PUNCT', 'PRON', 'VERB', 'NOUN', 'ADV', 'ADV']
words1 = ["I", "like", "cats"]
pos1 = ["PRON", "VERB", "NOUN"]
words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"]
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
pattern = get_doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr='POS')
matcher.add('TEST', None, pattern)
matcher = PhraseMatcher(en_vocab, attr="POS")
matcher.add("TEST", None, pattern)
doc = get_doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 1
match_id, start, end = matches[0]
assert match_id == en_vocab.strings['TEST']
assert match_id == en_vocab.strings["TEST"]
assert start == 2
assert end == 5
def test_phrase_matcher_string_attrs_negative(en_vocab):
"""Test that token with the control codes as ORTH are *not* matched."""
words1 = ['I', 'like', 'cats']
pos1 = ['PRON', 'VERB', 'NOUN']
words2 = ['matcher:POS-PRON', 'matcher:POS-VERB', 'matcher:POS-NOUN']
pos2 = ['X', 'X', 'X']
words1 = ["I", "like", "cats"]
pos1 = ["PRON", "VERB", "NOUN"]
words2 = ["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"]
pos2 = ["X", "X", "X"]
pattern = get_doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr='POS')
matcher.add('TEST', None, pattern)
matcher = PhraseMatcher(en_vocab, attr="POS")
matcher.add("TEST", None, pattern)
doc = get_doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 0
def test_phrase_matcher_bool_attrs(en_vocab):
words1 = ['Hello', 'world', '!']
words2 = ['No', 'problem', ',', 'he', 'said', '.']
words1 = ["Hello", "world", "!"]
words2 = ["No", "problem", ",", "he", "said", "."]
pattern = Doc(en_vocab, words=words1)
matcher = PhraseMatcher(en_vocab, attr='IS_PUNCT')
matcher.add('TEST', None, pattern)
matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
matcher.add("TEST", None, pattern)
doc = Doc(en_vocab, words=words2)
matches = matcher(doc)
assert len(matches) == 2
match_id1, start1, end1 = matches[0]
match_id2, start2, end2 = matches[1]
assert match_id1 == en_vocab.strings['TEST']
assert match_id2 == en_vocab.strings['TEST']
assert match_id1 == en_vocab.strings["TEST"]
assert match_id2 == en_vocab.strings["TEST"]
assert start1 == 0
assert end1 == 3
assert start2 == 3
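
In contrast with the token-based Matcher, the PhraseMatcher used above takes Doc objects as patterns; a minimal sketch mirroring test_matcher_phrase_matcher, with a bare Vocab() assumed in place of the en_vocab fixture (sufficient for the default ORTH-based matching):

from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()  # assumption: stands in for the en_vocab fixture used above
matcher = PhraseMatcher(vocab)
# Patterns are Doc objects rather than lists of token-attribute dicts.
matcher.add("COMPANY", None, Doc(vocab, words=["Google", "Now"]))
doc = Doc(vocab, words=["I", "like", "Google", "Now", "best"])
assert len(matcher(doc)) == 1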

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals
import pytest
import numpy.random
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps
from spacy.attrs import NORM
@ -20,18 +19,17 @@ def vocab():
@pytest.fixture
def parser(vocab):
parser = DependencyParser(vocab)
parser.cfg['token_vector_width'] = 8
parser.cfg['hidden_width'] = 30
parser.cfg['hist_size'] = 0
parser.add_label('left')
parser.cfg["token_vector_width"] = 8
parser.cfg["hidden_width"] = 30
parser.cfg["hist_size"] = 0
parser.add_label("left")
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['left', 'ROOT', 'left', 'ROOT'])
doc = Doc(vocab, words=["a", "b", "c", "d"])
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
@ -44,29 +42,30 @@ def test_init_parser(parser):
# TODO: This now seems to be implicated in segfaults. Not sure what's up!
@pytest.mark.skip
def test_add_label(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)
assert doc[0].head.i == 1
assert doc[0].dep_ == 'left'
assert doc[0].dep_ == "left"
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[2].head.i == 3
parser.add_label('right')
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
parser.add_label("right")
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)
assert doc[0].head.i == 1
assert doc[0].dep_ == 'left'
assert doc[0].dep_ == "left"
assert doc[1].head.i == 1
assert doc[2].head.i == 3
assert doc[2].head.i == 3
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['right', 'ROOT', 'left', 'ROOT'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = GoldParse(
doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
)
parser.update([doc], [gold], sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)
assert doc[0].dep_ == 'right'
assert doc[2].dep_ == 'left'
assert doc[0].dep_ == "right"
assert doc[2].dep_ == "left"

View File

@ -31,16 +31,19 @@ def get_sequence_costs(M, words, heads, deps, transitions):
def vocab():
return Vocab()
@pytest.fixture
def arc_eager(vocab):
moves = ArcEager(vocab.strings, ArcEager.get_actions())
moves.add_action(2, 'left')
moves.add_action(3, 'right')
moves.add_action(2, "left")
moves.add_action(3, "right")
return moves
@pytest.fixture
def words():
return ['a', 'b']
return ["a", "b"]
@pytest.fixture
def doc(words, vocab):
@ -48,19 +51,21 @@ def doc(words, vocab):
vocab = Vocab()
return Doc(vocab, words=list(words))
@pytest.fixture
def gold(doc, words):
if len(words) == 2:
return GoldParse(doc, words=['a', 'b'], heads=[0, 0], deps=['ROOT', 'right'])
return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"])
else:
raise NotImplementedError
@pytest.mark.xfail
def test_oracle_four_words(arc_eager, vocab):
words = ['a', 'b', 'c', 'd']
words = ["a", "b", "c", "d"]
heads = [1, 1, 3, 3]
deps = ['left', 'ROOT', 'left', 'ROOT']
actions = ['L-left', 'B-ROOT', 'L-left']
deps = ["left", "ROOT", "left", "ROOT"]
actions = ["L-left", "B-ROOT", "L-left"]
state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions)
assert state.is_final()
for i, state_costs in enumerate(cost_history):
@ -72,63 +77,65 @@ def test_oracle_four_words(arc_eager, vocab):
annot_tuples = [
(0, 'When', 'WRB', 11, 'advmod', 'O'),
(1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'),
(2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'),
(3, ',', ',', 2, 'punct', 'O'),
(4, 'our', 'PRP$', 6, 'poss', 'O'),
(5, 'embedded', 'VBN', 6, 'amod', 'O'),
(6, 'reporter', 'NN', 2, 'appos', 'O'),
(7, 'with', 'IN', 6, 'prep', 'O'),
(8, 'the', 'DT', 10, 'det', 'B-ORG'),
(9, '3rd', 'NNP', 10, 'compound', 'I-ORG'),
(10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'),
(11, 'says', 'VBZ', 44, 'advcl', 'O'),
(12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'),
(13, 'battalions', 'NNS', 16, 'nsubj', 'O'),
(14, 'of', 'IN', 13, 'prep', 'O'),
(15, 'troops', 'NNS', 14, 'pobj', 'O'),
(16, 'are', 'VBP', 11, 'ccomp', 'O'),
(17, 'on', 'IN', 16, 'prep', 'O'),
(18, 'the', 'DT', 19, 'det', 'O'),
(19, 'ground', 'NN', 17, 'pobj', 'O'),
(20, ',', ',', 17, 'punct', 'O'),
(21, 'inside', 'IN', 17, 'prep', 'O'),
(22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'),
(23, 'itself', 'PRP', 22, 'appos', 'O'),
(24, ',', ',', 16, 'punct', 'O'),
(25, 'have', 'VBP', 26, 'aux', 'O'),
(26, 'taken', 'VBN', 16, 'dep', 'O'),
(27, 'up', 'RP', 26, 'prt', 'O'),
(28, 'positions', 'NNS', 26, 'dobj', 'O'),
(29, 'they', 'PRP', 31, 'nsubj', 'O'),
(30, "'re", 'VBP', 31, 'aux', 'O'),
(31, 'going', 'VBG', 26, 'parataxis', 'O'),
(32, 'to', 'TO', 33, 'aux', 'O'),
(33, 'spend', 'VB', 31, 'xcomp', 'O'),
(34, 'the', 'DT', 35, 'det', 'B-TIME'),
(35, 'night', 'NN', 33, 'dobj', 'L-TIME'),
(36, 'there', 'RB', 33, 'advmod', 'O'),
(37, 'presumably', 'RB', 33, 'advmod', 'O'),
(38, ',', ',', 44, 'punct', 'O'),
(39, 'how', 'WRB', 40, 'advmod', 'O'),
(40, 'many', 'JJ', 41, 'amod', 'O'),
(41, 'soldiers', 'NNS', 44, 'pobj', 'O'),
(42, 'are', 'VBP', 44, 'aux', 'O'),
(43, 'we', 'PRP', 44, 'nsubj', 'O'),
(44, 'talking', 'VBG', 44, 'ROOT', 'O'),
(45, 'about', 'IN', 44, 'prep', 'O'),
(46, 'right', 'RB', 47, 'advmod', 'O'),
(47, 'now', 'RB', 44, 'advmod', 'O'),
(48, '?', '.', 44, 'punct', 'O')]
(0, "When", "WRB", 11, "advmod", "O"),
(1, "Walter", "NNP", 2, "compound", "B-PERSON"),
(2, "Rodgers", "NNP", 11, "nsubj", "L-PERSON"),
(3, ",", ",", 2, "punct", "O"),
(4, "our", "PRP$", 6, "poss", "O"),
(5, "embedded", "VBN", 6, "amod", "O"),
(6, "reporter", "NN", 2, "appos", "O"),
(7, "with", "IN", 6, "prep", "O"),
(8, "the", "DT", 10, "det", "B-ORG"),
(9, "3rd", "NNP", 10, "compound", "I-ORG"),
(10, "Cavalry", "NNP", 7, "pobj", "L-ORG"),
(11, "says", "VBZ", 44, "advcl", "O"),
(12, "three", "CD", 13, "nummod", "U-CARDINAL"),
(13, "battalions", "NNS", 16, "nsubj", "O"),
(14, "of", "IN", 13, "prep", "O"),
(15, "troops", "NNS", 14, "pobj", "O"),
(16, "are", "VBP", 11, "ccomp", "O"),
(17, "on", "IN", 16, "prep", "O"),
(18, "the", "DT", 19, "det", "O"),
(19, "ground", "NN", 17, "pobj", "O"),
(20, ",", ",", 17, "punct", "O"),
(21, "inside", "IN", 17, "prep", "O"),
(22, "Baghdad", "NNP", 21, "pobj", "U-GPE"),
(23, "itself", "PRP", 22, "appos", "O"),
(24, ",", ",", 16, "punct", "O"),
(25, "have", "VBP", 26, "aux", "O"),
(26, "taken", "VBN", 16, "dep", "O"),
(27, "up", "RP", 26, "prt", "O"),
(28, "positions", "NNS", 26, "dobj", "O"),
(29, "they", "PRP", 31, "nsubj", "O"),
(30, "'re", "VBP", 31, "aux", "O"),
(31, "going", "VBG", 26, "parataxis", "O"),
(32, "to", "TO", 33, "aux", "O"),
(33, "spend", "VB", 31, "xcomp", "O"),
(34, "the", "DT", 35, "det", "B-TIME"),
(35, "night", "NN", 33, "dobj", "L-TIME"),
(36, "there", "RB", 33, "advmod", "O"),
(37, "presumably", "RB", 33, "advmod", "O"),
(38, ",", ",", 44, "punct", "O"),
(39, "how", "WRB", 40, "advmod", "O"),
(40, "many", "JJ", 41, "amod", "O"),
(41, "soldiers", "NNS", 44, "pobj", "O"),
(42, "are", "VBP", 44, "aux", "O"),
(43, "we", "PRP", 44, "nsubj", "O"),
(44, "talking", "VBG", 44, "ROOT", "O"),
(45, "about", "IN", 44, "prep", "O"),
(46, "right", "RB", 47, "advmod", "O"),
(47, "now", "RB", 44, "advmod", "O"),
(48, "?", ".", 44, "punct", "O"),
]
def test_get_oracle_actions():
doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
parser = DependencyParser(doc.vocab)
parser.moves.add_action(0, '')
parser.moves.add_action(1, '')
parser.moves.add_action(1, '')
parser.moves.add_action(4, 'ROOT')
parser.moves.add_action(0, "")
parser.moves.add_action(1, "")
parser.moves.add_action(1, "")
parser.moves.add_action(4, "ROOT")
for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
if head > i:
parser.moves.add_action(2, dep)

View File

@ -16,15 +16,17 @@ def vocab():
@pytest.fixture
def doc(vocab):
return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.'])
return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."])
@pytest.fixture
def entity_annots(doc):
casey = doc[0:1]
ny = doc[3:5]
return [(casey.start_char, casey.end_char, 'PERSON'),
(ny.start_char, ny.end_char, 'GPE')]
return [
(casey.start_char, casey.end_char, "PERSON"),
(ny.start_char, ny.end_char, "GPE"),
]
@pytest.fixture
@ -43,32 +45,33 @@ def test_get_oracle_moves(tsys, doc, entity_annots):
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O']
assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"]
def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots]
entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots]
gold = GoldParse(doc, entities=entity_annots)
for i, tag in enumerate(gold.ner):
if tag == 'L-!GPE':
gold.ner[i] = '-'
if tag == "L-!GPE":
gold.ner[i] = "-"
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
def test_get_oracle_moves_negative_entities2(tsys, vocab):
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
doc = Doc(vocab, words=["A", "B", "C", "D"])
gold = GoldParse(doc, entities=[])
gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON']
gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"]
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
def test_get_oracle_moves_negative_O(tsys, vocab):
doc = Doc(vocab, words=['A', 'B', 'C', 'D'])
doc = Doc(vocab, words=["A", "B", "C", "D"])
gold = GoldParse(doc, entities=[])
gold.ner = ['O', '!O', 'O', '!O']
gold.ner = ["O", "!O", "O", "!O"]
tsys.preprocess_gold(gold)
act_classes = tsys.get_oracle_sequence(doc, gold)
names = [tsys.get_class_name(act) for act in act_classes]
@ -80,8 +83,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner.begin_training([])
ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))
doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]

View File

@ -17,7 +17,7 @@ def vocab():
@pytest.fixture
def arc_eager(vocab):
actions = ArcEager.get_actions(left_labels=['L'], right_labels=['R'])
actions = ArcEager.get_actions(left_labels=["L"], right_labels=["R"])
return ArcEager(vocab.strings, actions)
@ -30,6 +30,7 @@ def tok2vec():
def parser(vocab, arc_eager):
return Parser(vocab, moves=arc_eager, model=None)
@pytest.fixture
def model(arc_eager, tok2vec):
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
@ -37,12 +38,12 @@ def model(arc_eager, tok2vec):
@pytest.fixture
def doc(vocab):
return Doc(vocab, words=['a', 'b', 'c'])
return Doc(vocab, words=["a", "b", "c"])
@pytest.fixture
def gold(doc):
return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R'])
return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"])
def test_can_init_nn_parser(parser):
@ -62,8 +63,10 @@ def test_predict_doc(parser, tok2vec, model, doc):
def test_update_doc(parser, model, doc, gold):
parser.model = model
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
parser.update([doc], [gold], sgd=optimize)
@ -76,6 +79,8 @@ def test_predict_doc_beam(parser, model, doc):
@pytest.mark.xfail
def test_update_doc_beam(parser, model, doc, gold):
parser.model = model
def optimize(weights, gradient, key=None):
weights -= 0.001 * gradient
parser.update_beam([doc], [gold], sgd=optimize)

View File

@ -21,20 +21,22 @@ def vocab():
@pytest.fixture
def moves(vocab):
aeager = ArcEager(vocab.strings, {})
aeager.add_action(2, 'nsubj')
aeager.add_action(3, 'dobj')
aeager.add_action(2, 'aux')
aeager.add_action(2, "nsubj")
aeager.add_action(3, "dobj")
aeager.add_action(2, "aux")
return aeager
@pytest.fixture
def docs(vocab):
return [Doc(vocab, words=['Rats', 'bite', 'things'])]
return [Doc(vocab, words=["Rats", "bite", "things"])]
@pytest.fixture
def states(docs):
return [StateClass(doc) for doc in docs]
@pytest.fixture
def tokvecs(docs, vector_size):
output = []
@ -73,9 +75,10 @@ def beam(moves, states, golds, beam_width):
def scores(moves, batch_size, beam_width):
return [
numpy.asarray(
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)),
dtype='f')
for _ in range(batch_size)]
numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f"
)
for _ in range(batch_size)
]
def test_create_beam(beam):
@ -93,8 +96,8 @@ def test_beam_advance_too_few_scores(beam, scores):
def test_beam_parse():
nlp = Language()
nlp.add_pipe(DependencyParser(nlp.vocab), name='parser')
nlp.parser.add_label('nsubj')
nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
nlp.parser.add_label("nsubj")
nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
doc = nlp.make_doc('Australia is a country')
doc = nlp.make_doc("Australia is a country")
nlp.parser(doc, beam_width=2)

View File

@ -40,106 +40,116 @@ def multirooted_tree():
def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
assert([a for a in ancestors(3, tree)] == [4, 5, 2])
assert([a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4])
assert([a for a in ancestors(3, partial_tree)] == [4, 5, None])
assert([a for a in ancestors(17, multirooted_tree)] == [])
assert [a for a in ancestors(3, tree)] == [4, 5, 2]
assert [a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4]
assert [a for a in ancestors(3, partial_tree)] == [4, 5, None]
assert [a for a in ancestors(17, multirooted_tree)] == []
def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
assert(contains_cycle(tree) == None)
assert(contains_cycle(cyclic_tree) == set([3, 4, 5]))
assert(contains_cycle(partial_tree) == None)
assert(contains_cycle(multirooted_tree) == None)
assert contains_cycle(tree) == None
assert contains_cycle(cyclic_tree) == set([3, 4, 5])
assert contains_cycle(partial_tree) == None
assert contains_cycle(multirooted_tree) == None
def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
assert(is_nonproj_arc(0, nonproj_tree) == False)
assert(is_nonproj_arc(1, nonproj_tree) == False)
assert(is_nonproj_arc(2, nonproj_tree) == False)
assert(is_nonproj_arc(3, nonproj_tree) == False)
assert(is_nonproj_arc(4, nonproj_tree) == False)
assert(is_nonproj_arc(5, nonproj_tree) == False)
assert(is_nonproj_arc(6, nonproj_tree) == False)
assert(is_nonproj_arc(7, nonproj_tree) == True)
assert(is_nonproj_arc(8, nonproj_tree) == False)
assert(is_nonproj_arc(7, partial_tree) == False)
assert(is_nonproj_arc(17, multirooted_tree) == False)
assert(is_nonproj_arc(16, multirooted_tree) == True)
assert is_nonproj_arc(0, nonproj_tree) == False
assert is_nonproj_arc(1, nonproj_tree) == False
assert is_nonproj_arc(2, nonproj_tree) == False
assert is_nonproj_arc(3, nonproj_tree) == False
assert is_nonproj_arc(4, nonproj_tree) == False
assert is_nonproj_arc(5, nonproj_tree) == False
assert is_nonproj_arc(6, nonproj_tree) == False
assert is_nonproj_arc(7, nonproj_tree) == True
assert is_nonproj_arc(8, nonproj_tree) == False
assert is_nonproj_arc(7, partial_tree) == False
assert is_nonproj_arc(17, multirooted_tree) == False
assert is_nonproj_arc(16, multirooted_tree) == True
def test_parser_is_nonproj_tree(proj_tree, nonproj_tree, partial_tree, multirooted_tree):
assert(is_nonproj_tree(proj_tree) == False)
assert(is_nonproj_tree(nonproj_tree) == True)
assert(is_nonproj_tree(partial_tree) == False)
assert(is_nonproj_tree(multirooted_tree) == True)
def test_parser_is_nonproj_tree(
proj_tree, nonproj_tree, partial_tree, multirooted_tree
):
assert is_nonproj_tree(proj_tree) == False
assert is_nonproj_tree(nonproj_tree) == True
assert is_nonproj_tree(partial_tree) == False
assert is_nonproj_tree(multirooted_tree) == True
def test_parser_pseudoprojectivity(en_tokenizer):
def deprojectivize(proj_heads, deco_labels):
tokens = en_tokenizer('whatever ' * len(proj_heads))
rel_proj_heads = [head-i for i, head in enumerate(proj_heads)]
doc = get_doc(tokens.vocab, words=[t.text for t in tokens],
deps=deco_labels, heads=rel_proj_heads)
tokens = en_tokenizer("whatever " * len(proj_heads))
rel_proj_heads = [head - i for i, head in enumerate(proj_heads)]
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
deps=deco_labels,
heads=rel_proj_heads,
)
nonproj.deprojectivize(doc)
return [t.head.i for t in doc], [token.dep_ for token in doc]
# fmt: off
tree = [1, 2, 2]
nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2]
nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl', 'punct']
labels2 = ['advmod', 'root', 'det', 'nsubj', 'advmod', 'det', 'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod', 'det', 'amod', 'punct']
labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"]
labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"]
# fmt: on
assert(nonproj.decompose('X||Y') == ('X','Y'))
assert(nonproj.decompose('X') == ('X',''))
assert(nonproj.is_decorated('X||Y') == True)
assert(nonproj.is_decorated('X') == False)
assert nonproj.decompose("X||Y") == ("X", "Y")
assert nonproj.decompose("X") == ("X", "")
assert nonproj.is_decorated("X||Y") == True
assert nonproj.is_decorated("X") == False
nonproj._lift(0, tree)
assert(tree == [2, 2, 2])
assert tree == [2, 2, 2]
assert(nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7)
assert(nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10)
assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7
assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10
# fmt: off
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels)
assert(proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2])
assert(deco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux',
'nsubj', 'acl||dobj', 'punct'])
assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]
assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
"nsubj", "acl||dobj", "punct"]
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert(deproj_heads == nonproj_tree)
assert(undeco_labels == labels)
assert deproj_heads == nonproj_tree
assert undeco_labels == labels
proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2)
assert(proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1])
assert(deco_labels == ['advmod||aux', 'root', 'det', 'nsubj', 'advmod',
'det', 'dobj', 'det', 'nmod', 'aux', 'nmod||dobj',
'advmod', 'det', 'amod', 'punct'])
assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod",
"det", "dobj", "det", "nmod", "aux", "nmod||dobj",
"advmod", "det", "amod", "punct"]
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert(deproj_heads == nonproj_tree2)
assert(undeco_labels == labels2)
assert deproj_heads == nonproj_tree2
assert undeco_labels == labels2
# if decoration is wrong such that there is no head with the desired label
# the structure is kept and the label is undecorated
proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2]
deco_labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj',
'acl||iobj', 'punct']
deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj",
"acl||iobj", "punct"]
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert(deproj_heads == proj_heads)
assert(undeco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux',
'nsubj', 'acl', 'punct'])
assert deproj_heads == proj_heads
assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux",
"nsubj", "acl", "punct"]
# if there are two potential new heads, the first one is chosen even if
# it's wrong
# it"s wrong
proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]
deco_labels = ['advmod||aux', 'root', 'det', 'aux', 'advmod', 'det',
'dobj', 'det', 'nmod', 'aux', 'nmod||dobj', 'advmod',
'det', 'amod', 'punct']
deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det",
"dobj", "det", "nmod", "aux", "nmod||dobj", "advmod",
"det", "amod", "punct"]
deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels)
assert(deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1])
assert(undeco_labels == ['advmod', 'root', 'det', 'aux', 'advmod', 'det',
'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod',
'det', 'amod', 'punct'])
assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]
assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det",
"dobj", "det", "nmod", "aux", "nmod", "advmod",
"det", "amod", "punct"]
# fmt: on

View File

@ -9,7 +9,7 @@ from ..util import get_doc, apply_transition_sequence
def test_parser_root(en_tokenizer):
text = "i don't have other assistance"
heads = [3, 2, 1, 0, 1, -2]
deps = ['nsubj', 'aux', 'neg', 'ROOT', 'amod', 'dobj']
deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
for t in doc:
@ -17,10 +17,12 @@ def test_parser_root(en_tokenizer):
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["Hello"])
@pytest.mark.parametrize("text", ["Hello"])
def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
)
assert len(doc) == 1
with en_parser.step_through(doc) as _:
@ -32,7 +34,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
def test_parser_initial(en_tokenizer, en_parser):
text = "I ate the pizza with anchovies."
heads = [1, 0, 1, -2, -3, -1, -5]
transition = ['L-nsubj', 'S', 'L-det']
transition = ["L-nsubj", "S", "L-det"]
tokens = en_tokenizer(text)
apply_transition_sequence(en_parser, tokens, transition)
assert tokens[0].head.i == 1
@ -58,17 +60,19 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser):
def test_parser_merge_pp(en_tokenizer):
text = "A phrase with another phrase occurs"
heads = [1, 4, -1, 1, -2, 0]
deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT']
tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ']
deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"]
tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags)
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags
)
nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks]
for start, end, lemma in nps:
doc.merge(start, end, label='NP', lemma=lemma)
assert doc[0].text == 'A phrase'
assert doc[1].text == 'with'
assert doc[2].text == 'another phrase'
assert doc[3].text == 'occurs'
doc.merge(start, end, label="NP", lemma=lemma)
assert doc[0].text == "A phrase"
assert doc[1].text == "with"
assert doc[2].text == "another phrase"
assert doc[3].text == "occurs"
@pytest.mark.xfail
@ -76,7 +80,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
text = "a b c d e"
# right branching
transition = ['R-nsubj', 'D', 'R-nsubj', 'R-nsubj', 'D', 'R-ROOT']
transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"]
tokens = en_tokenizer(text)
apply_transition_sequence(en_parser, tokens, transition)
@ -111,7 +115,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser):
assert tokens[4].head.i == 2
# left branching
transition = ['S', 'S', 'S', 'L-nsubj','L-nsubj','L-nsubj', 'L-nsubj']
transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"]
tokens = en_tokenizer(text)
apply_transition_sequence(en_parser, tokens, transition)

View File

@ -33,6 +33,7 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
@pytest.fixture
def heads():
# fmt: off
return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15,
-1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
-4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14,
@ -50,6 +51,7 @@ def heads():
0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1,
-1, -8, -9, -1]
# fmt: on
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
@ -100,7 +102,14 @@ def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
for token in doc:
subtree = list(token.subtree)
debug = '\t'.join((token.text, token.left_edge.text, subtree[0].text))
debug = "\t".join((token.text, token.left_edge.text, subtree[0].text))
assert token.left_edge == subtree[0], debug
debug = '\t'.join((token.text, token.right_edge.text, subtree[-1].text, token.right_edge.head.text))
debug = "\t".join(
(
token.text,
token.right_edge.text,
subtree[-1].text,
token.right_edge.head.text,
)
)
assert token.right_edge == subtree[-1], debug

View File

@ -19,34 +19,33 @@ def vocab():
@pytest.fixture
def parser(vocab):
parser = DependencyParser(vocab)
parser.cfg['token_vector_width'] = 4
parser.cfg['hidden_width'] = 32
#parser.add_label('right')
parser.add_label('left')
parser.cfg["token_vector_width"] = 4
parser.cfg["hidden_width"] = 32
# parser.add_label('right')
parser.add_label("left")
parser.begin_training([], **parser.cfg)
sgd = Adam(NumpyOps(), 0.001)
for i in range(10):
losses = {}
doc = Doc(vocab, words=['a', 'b', 'c', 'd'])
gold = GoldParse(doc, heads=[1, 1, 3, 3],
deps=['left', 'ROOT', 'left', 'ROOT'])
doc = Doc(vocab, words=["a", "b", "c", "d"])
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
parser.update([doc], [gold], sgd=sgd, losses=losses)
return parser
def test_no_sentences(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)
assert len(list(doc.sents)) >= 1
def test_sents_1(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) >= 2
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc[1].sent_start = False
doc[2].sent_start = True
doc[3].sent_start = False
@ -55,7 +54,7 @@ def test_sents_1(parser):
def test_sents_1_2(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc[1].sent_start = True
doc[2].sent_start = True
doc = parser(doc)
@ -63,12 +62,12 @@ def test_sents_1_2(parser):
def test_sents_1_3(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc[1].sent_start = True
doc[3].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) >= 3
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc[1].sent_start = True
doc[2].sent_start = False
doc[3].sent_start = True

View File

@ -19,11 +19,13 @@ def test_parser_space_attachment(en_tokenizer):
def test_parser_sentence_space(en_tokenizer):
# fmt: off
text = "I look forward to using Thingamajig. I've been told it will make my life easier..."
heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]
deps = ['nsubj', 'ROOT', 'advmod', 'prep', 'pcomp', 'dobj', 'punct', '',
'nsubjpass', 'aux', 'auxpass', 'ROOT', 'nsubj', 'aux', 'ccomp',
'poss', 'nsubj', 'ccomp', 'punct']
deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "",
"nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp",
"poss", "nsubj", "ccomp", "punct"]
# fmt: on
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
assert len(list(doc.sents)) == 2
@ -34,10 +36,10 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
text = "\t \n This is a sentence ."
heads = [1, 1, 0, 1, -2, -3]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads)
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
assert doc[0].is_space
assert doc[1].is_space
assert doc[2].text == 'This'
assert doc[2].text == "This"
with en_parser.step_through(doc) as stepwise:
pass
assert doc[0].head.i == 2
@ -49,9 +51,9 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser):
def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
text = "This is \t a \t\n \n sentence . \n\n \n"
heads = [1, 0, -1, 2, -1, -4, -5, -1]
transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct']
transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads)
doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads)
assert doc[2].is_space
assert doc[4].is_space
assert doc[5].is_space
@ -64,8 +66,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
assert [token.head.i for token in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
@pytest.mark.parametrize('text,length', [(['\n'], 1),
(['\n', '\t', '\n\n', '\t'], 4)])
@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)])
@pytest.mark.xfail
def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
doc = Doc(en_parser.vocab, words=text)
@ -74,4 +75,4 @@ def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
pass
assert doc[0].is_space
for token in doc:
assert token.head.i == length-1
assert token.head.i == length - 1

View File

@ -18,14 +18,16 @@ def patterns():
{"label": "HELLO", "pattern": "hello world"},
{"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
{"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}
{"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
]
@pytest.fixture
def add_ent():
def add_ent_component(doc):
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings['ORG'])]
doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])]
return doc
return add_ent_component
@ -33,13 +35,13 @@ def test_entity_ruler_init(nlp, patterns):
ruler = EntityRuler(nlp, patterns=patterns)
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 3
assert 'HELLO' in ruler
assert 'BYE' in ruler
assert "HELLO" in ruler
assert "BYE" in ruler
nlp.add_pipe(ruler)
doc = nlp("hello world bye bye")
assert len(doc.ents) == 2
assert doc.ents[0].label_ == 'HELLO'
assert doc.ents[1].label_ == 'BYE'
assert doc.ents[0].label_ == "HELLO"
assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_existing(nlp, patterns, add_ent):
@ -48,8 +50,8 @@ def test_entity_ruler_existing(nlp, patterns, add_ent):
nlp.add_pipe(ruler)
doc = nlp("OH HELLO WORLD bye bye")
assert len(doc.ents) == 2
assert doc.ents[0].label_ == 'ORG'
assert doc.ents[1].label_ == 'BYE'
assert doc.ents[0].label_ == "ORG"
assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):
@ -58,9 +60,9 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent):
nlp.add_pipe(ruler)
doc = nlp("OH HELLO WORLD bye bye")
assert len(doc.ents) == 2
assert doc.ents[0].label_ == 'HELLO'
assert doc.ents[0].text == 'HELLO'
assert doc.ents[1].label_ == 'BYE'
assert doc.ents[0].label_ == "HELLO"
assert doc.ents[0].text == "HELLO"
assert doc.ents[1].label_ == "BYE"
def test_entity_ruler_existing_complex(nlp, patterns, add_ent):
@ -69,8 +71,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, add_ent):
nlp.add_pipe(ruler)
doc = nlp("foo foo bye bye")
assert len(doc.ents) == 2
assert doc.ents[0].label_ == 'COMPLEX'
assert doc.ents[1].label_ == 'BYE'
assert doc.ents[0].label_ == "COMPLEX"
assert doc.ents[1].label_ == "BYE"
assert len(doc.ents[0]) == 2
assert len(doc.ents[1]) == 2

View File

@ -10,15 +10,21 @@ from ..util import get_doc
@pytest.fixture
def doc(en_tokenizer):
text = 'I like New York in Autumn.'
text = "I like New York in Autumn."
heads = [1, 0, 1, -2, -3, -1, -5]
tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
pos = ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT']
deps = ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads,
tags=tags, pos=pos, deps=deps)
doc.ents = [Span(doc, 2, 4, doc.vocab.strings['GPE'])]
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=heads,
tags=tags,
pos=pos,
deps=deps,
)
doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])]
doc.is_parsed = True
doc.is_tagged = True
return doc
@ -27,18 +33,18 @@ def doc(en_tokenizer):
def test_factories_merge_noun_chunks(doc):
assert len(doc) == 7
nlp = Language()
merge_noun_chunks = nlp.create_pipe('merge_noun_chunks')
merge_noun_chunks = nlp.create_pipe("merge_noun_chunks")
merge_noun_chunks(doc)
assert len(doc) == 6
assert doc[2].text == 'New York'
assert doc[2].text == "New York"
def test_factories_merge_ents(doc):
assert len(doc) == 7
assert len(list(doc.ents)) == 1
nlp = Language()
merge_entities = nlp.create_pipe('merge_entities')
merge_entities = nlp.create_pipe("merge_entities")
merge_entities(doc)
assert len(doc) == 6
assert len(list(doc.ents)) == 1
assert doc[2].text == 'New York'
assert doc[2].text == "New York"

View File

@ -16,22 +16,22 @@ def new_pipe(doc):
def test_add_pipe_no_name(nlp):
nlp.add_pipe(new_pipe)
assert 'new_pipe' in nlp.pipe_names
assert "new_pipe" in nlp.pipe_names
def test_add_pipe_duplicate_name(nlp):
nlp.add_pipe(new_pipe, name='duplicate_name')
nlp.add_pipe(new_pipe, name="duplicate_name")
with pytest.raises(ValueError):
nlp.add_pipe(new_pipe, name='duplicate_name')
nlp.add_pipe(new_pipe, name="duplicate_name")
@pytest.mark.parametrize('name', ['parser'])
@pytest.mark.parametrize("name", ["parser"])
def test_add_pipe_first(nlp, name):
nlp.add_pipe(new_pipe, name=name, first=True)
assert nlp.pipeline[0][0] == name
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")])
def test_add_pipe_last(nlp, name1, name2):
nlp.add_pipe(lambda doc: doc, name=name2)
nlp.add_pipe(new_pipe, name=name1, last=True)
@ -44,7 +44,7 @@ def test_cant_add_pipe_first_and_last(nlp):
nlp.add_pipe(new_pipe, first=True, last=True)
@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_get_pipe(nlp, name):
with pytest.raises(KeyError):
nlp.get_pipe(name)
@ -52,7 +52,7 @@ def test_get_pipe(nlp, name):
assert nlp.get_pipe(name) == new_pipe
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
@pytest.mark.parametrize("name,replacement", [("my_component", lambda doc: doc)])
def test_replace_pipe(nlp, name, replacement):
with pytest.raises(ValueError):
nlp.replace_pipe(name, new_pipe)
@ -62,7 +62,7 @@ def test_replace_pipe(nlp, name, replacement):
assert nlp.get_pipe(name) == replacement
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")])
def test_rename_pipe(nlp, old_name, new_name):
with pytest.raises(ValueError):
nlp.rename_pipe(old_name, new_name)
@ -71,7 +71,7 @@ def test_rename_pipe(nlp, old_name, new_name):
assert nlp.pipeline[0][0] == new_name
@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_remove_pipe(nlp, name):
with pytest.raises(ValueError):
nlp.remove_pipe(name)
@ -83,7 +83,7 @@ def test_remove_pipe(nlp, name):
assert removed_component == new_pipe
@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_disable_pipes_method(nlp, name):
nlp.add_pipe(new_pipe, name=name)
assert nlp.has_pipe(name)
@ -92,7 +92,7 @@ def test_disable_pipes_method(nlp, name):
disabled.restore()
@pytest.mark.parametrize('name', ['my_component'])
@pytest.mark.parametrize("name", ["my_component"])
def test_disable_pipes_context(nlp, name):
nlp.add_pipe(new_pipe, name=name)
assert nlp.has_pipe(name)
@ -101,14 +101,14 @@ def test_disable_pipes_context(nlp, name):
assert nlp.has_pipe(name)
@pytest.mark.parametrize('n_pipes', [100])
@pytest.mark.parametrize("n_pipes", [100])
def test_add_lots_of_pipes(nlp, n_pipes):
for i in range(n_pipes):
nlp.add_pipe(lambda doc: doc, name='pipe_%d' % i)
nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i)
assert len(nlp.pipe_names) == n_pipes
@pytest.mark.parametrize('component', ['ner', {'hello': 'world'}])
@pytest.mark.parametrize("component", ["ner", {"hello": "world"}])
def test_raise_for_invalid_components(nlp, component):
with pytest.raises(ValueError):
nlp.add_pipe(component)

View File

@ -13,16 +13,21 @@ from spacy.gold import GoldParse
@pytest.mark.skip(reason="Test is flakey when run with others")
def test_simple_train():
nlp = Language()
nlp.add_pipe(nlp.create_pipe('textcat'))
nlp.get_pipe('textcat').add_label('answer')
nlp.add_pipe(nlp.create_pipe("textcat"))
nlp.get_pipe("textcat").add_label("answer")
nlp.begin_training()
for i in range(5):
for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.),
('bbbbbbbbb', 0.), ('aaaaaa', 1)]:
nlp.update([text], [{'cats': {'answer': answer}}])
doc = nlp('aaa')
assert 'answer' in doc.cats
assert doc.cats['answer'] >= 0.5
for text, answer in [
("aaaa", 1.0),
("bbbb", 0),
("aa", 1.0),
("bbbbbbbbb", 0.0),
("aaaaaa", 1),
]:
nlp.update([text], [{"cats": {"answer": answer}}])
doc = nlp("aaa")
assert "answer" in doc.cats
assert doc.cats["answer"] >= 0.5
@pytest.mark.skip(reason="Test is flakey when run with others")
@ -31,11 +36,11 @@ def test_textcat_learns_multilabel():
numpy.random.seed(5)
docs = []
nlp = Language()
letters = ['a', 'b', 'c']
letters = ["a", "b", "c"]
for w1 in letters:
for w2 in letters:
cats = {letter: float(w2==letter) for letter in letters}
docs.append((Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats))
cats = {letter: float(w2 == letter) for letter in letters}
docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
random.shuffle(docs)
model = TextCategorizer(nlp.vocab, width=8)
for letter in letters:
@ -49,8 +54,8 @@ def test_textcat_learns_multilabel():
random.shuffle(docs)
for w1 in letters:
for w2 in letters:
doc = Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3)
truth = {letter: w2==letter for letter in letters}
doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
truth = {letter: w2 == letter for letter in letters}
model(doc)
for cat, score in doc.cats.items():
if not truth[cat]:

View File

@ -14,14 +14,20 @@ from spacy.tokens import Doc
from ..util import get_doc, make_tempdir
@pytest.mark.parametrize('patterns', [
[[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
[[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]])
@pytest.mark.parametrize(
"patterns",
[
[[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
[[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]],
],
)
def test_issue118(en_tokenizer, patterns):
"""Test a bug that arose from having overlapping matches"""
text = "how many points did lebron james score against the boston celtics last night"
text = (
"how many points did lebron james score against the boston celtics last night"
)
doc = en_tokenizer(text)
ORG = doc.vocab.strings['ORG']
ORG = doc.vocab.strings["ORG"]
matcher = Matcher(doc.vocab)
matcher.add("BostonCeltics", None, *patterns)
assert len(list(doc.ents)) == 0
@ -35,16 +41,22 @@ def test_issue118(en_tokenizer, patterns):
assert ents[0].end == 11
@pytest.mark.parametrize('patterns', [
[[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]],
[[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]])
@pytest.mark.parametrize(
"patterns",
[
[[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]],
[[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]],
],
)
def test_issue118_prefix_reorder(en_tokenizer, patterns):
"""Test a bug that arose from having overlapping matches"""
text = "how many points did lebron james score against the boston celtics last night"
text = (
"how many points did lebron james score against the boston celtics last night"
)
doc = en_tokenizer(text)
ORG = doc.vocab.strings['ORG']
ORG = doc.vocab.strings["ORG"]
matcher = Matcher(doc.vocab)
matcher.add('BostonCeltics', None, *patterns)
matcher.add("BostonCeltics", None, *patterns)
assert len(list(doc.ents)) == 0
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
doc.ents += tuple(matches)[1:]
@ -59,11 +71,13 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
def test_issue242(en_tokenizer):
"""Test overlapping multi-word phrases."""
text = "There are different food safety standards in different countries."
patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}],
[{'LOWER': 'safety'}, {'LOWER': 'standards'}]]
patterns = [
[{"LOWER": "food"}, {"LOWER": "safety"}],
[{"LOWER": "safety"}, {"LOWER": "standards"}],
]
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
matcher.add('FOOD', None, *patterns)
matcher.add("FOOD", None, *patterns)
matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
doc.ents += tuple(matches)
@ -77,7 +91,9 @@ def test_issue242(en_tokenizer):
def test_issue309(en_tokenizer):
"""Test Issue #309: SBD fails on empty string"""
tokens = en_tokenizer(" ")
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT'])
doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"]
)
doc.is_parsed = True
assert len(doc) == 1
sents = list(doc.sents)
@ -93,11 +109,11 @@ def test_issue351(en_tokenizer):
def test_issue360(en_tokenizer):
"""Test tokenization of big ellipsis"""
tokens = en_tokenizer('$45...............Asking')
tokens = en_tokenizer("$45...............Asking")
assert len(tokens) > 2
@pytest.mark.parametrize('text1,text2', [("cat", "dog")])
@pytest.mark.parametrize("text1,text2", [("cat", "dog")])
def test_issue361(en_vocab, text1, text2):
"""Test Issue #361: Equality of lexemes"""
assert en_vocab[text1] == en_vocab[text1]
@ -106,15 +122,19 @@ def test_issue361(en_vocab, text1, text2):
def test_issue587(en_tokenizer):
"""Test that Matcher doesn't segfault on particular input"""
doc = en_tokenizer('a b; c')
doc = en_tokenizer("a b; c")
matcher = Matcher(doc.vocab)
matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}])
matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}])
matches = matcher(doc)
assert len(matches) == 1
matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}])
matcher.add(
"TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]
)
matches = matcher(doc)
assert len(matches) == 2
matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}])
matcher.add(
"TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]
)
matches = matcher(doc)
assert len(matches) == 2
@ -122,22 +142,26 @@ def test_issue587(en_tokenizer):
def test_issue588(en_vocab):
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
matcher.add('TEST', None, [])
matcher.add("TEST", None, [])
@pytest.mark.xfail
def test_issue589():
vocab = Vocab()
vocab.strings.set_frozen(True)
doc = Doc(vocab, words=['whata'])
doc = Doc(vocab, words=["whata"])
def test_issue590(en_vocab):
"""Test overlapping matches"""
doc = Doc(en_vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
matcher = Matcher(en_vocab)
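# two patterns are registered under the same "ab" key; one matcher call should return both matches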
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}])
matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}])
matcher.add(
"ab",
None,
[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
)
matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
matches = matcher(doc)
assert len(matches) == 2
@ -145,14 +169,14 @@ def test_issue590(en_vocab):
def test_issue595():
"""Test lemmatization of base forms"""
words = ["Do", "n't", "feed", "the", "dog"]
tag_map = {'VB': {POS: VERB, VerbForm_inf: True}}
tag_map = {"VB": {POS: VERB, VerbForm_inf: True}}
rules = {"verb": [["ed", "e"]]}
lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=words)
doc[2].tag_ = 'VB'
assert doc[2].text == 'feed'
assert doc[2].lemma_ == 'feed'
doc[2].tag_ = "VB"
assert doc[2].text == "feed"
assert doc[2].lemma_ == "feed"
def test_issue599(en_vocab):
@ -165,9 +189,9 @@ def test_issue599(en_vocab):
def test_issue600():
vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}})
doc = Doc(vocab, words=["hello"])
doc[0].tag_ = 'NN'
doc[0].tag_ = "NN"
def test_issue615(en_tokenizer):
@ -175,16 +199,17 @@ def test_issue615(en_tokenizer):
"""Merge a phrase. We have to be careful here because we'll change the
token indices. To avoid problems, merge all the phrases once we're called
on the last match."""
if i != len(matches)-1:
if i != len(matches) - 1:
return None
spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches]
for ent_id, label, span in spans:
span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
label=label)
span.merge(
tag="NNP" if label else span.root.tag_, lemma=span.text, label=label
)
doc.ents = doc.ents + ((label, span.start, span.end),)
text = "The golf club is broken"
pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
pattern = [{"ORTH": "golf"}, {"ORTH": "club"}]
label = "Sport_Equipment"
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
@ -195,7 +220,7 @@ def test_issue615(en_tokenizer):
assert entities[0].label != 0
@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")])
@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")])
def test_issue736(en_tokenizer, text, number):
"""Test that times like "7am" are tokenized correctly and that numbers are
converted to string."""
@ -204,7 +229,7 @@ def test_issue736(en_tokenizer, text, number):
assert tokens[0].text == number
@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"])
@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"])
def test_issue740(en_tokenizer, text):
"""Test that dates are not split and kept as one token. This behaviour is
currently inconsistent, since dates separated by hyphens are still split.
@ -214,14 +239,14 @@ def test_issue740(en_tokenizer, text):
def test_issue743():
doc = Doc(Vocab(), ['hello', 'world'])
doc = Doc(Vocab(), ["hello", "world"])
token = doc[0]
s = set([token])
items = list(s)
assert items[0] is token
@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"])
@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"])
def test_issue744(en_tokenizer, text):
"""Test that 'were' and 'Were' are excluded from the contractions
generated by the English tokenizer exceptions."""
@ -230,14 +255,15 @@ def test_issue744(en_tokenizer, text):
assert tokens[1].text.lower() == "were"
@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True),
("teneleven", False)])
@pytest.mark.parametrize(
"text,is_num", [("one", True), ("ten", True), ("teneleven", False)]
)
def test_issue759(en_tokenizer, text, is_num):
tokens = en_tokenizer(text)
assert tokens[0].like_num == is_num
@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"])
@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"])
def test_issue775(en_tokenizer, text):
"""Test that 'Shell' and 'shell' are excluded from the contractions
generated by the English tokenizer exceptions."""
@ -246,28 +272,32 @@ def test_issue775(en_tokenizer, text):
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
doc = en_tokenizer(text)
assert ''.join([token.text_with_ws for token in doc]) == text
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
"""Test base case for Issue #792: Non-trailing whitespace"""
doc = en_tokenizer(text)
assert ''.join([token.text_with_ws for token in doc]) == text
assert "".join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text,tokens', [
@pytest.mark.parametrize(
"text,tokens",
[
('"deserve,"--and', ['"', "deserve", ',"--', "and"]),
("exception;--exclusive", ["exception", ";--", "exclusive"]),
("day.--Is", ["day", ".--", "Is"]),
("refinement:--just", ["refinement", ":--", "just"]),
("memories?--To", ["memories", "?--", "To"]),
("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]),
("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])])
("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]),
],
)
def test_issue801(en_tokenizer, text, tokens):
"""Test that special characters + hyphens are split correctly."""
doc = en_tokenizer(text)
@ -275,10 +305,19 @@ def test_issue801(en_tokenizer, text, tokens):
assert [t.text for t in doc] == tokens
@pytest.mark.parametrize('text,expected_tokens', [
('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
])
@pytest.mark.parametrize(
"text,expected_tokens",
[
(
"Smörsåsen används bl.a. till fisk",
["Smörsåsen", "används", "bl.a.", "till", "fisk"],
),
(
"Jag kommer först kl. 13 p.g.a. diverse förseningar",
["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"],
),
],
)
def test_issue805(sv_tokenizer, text, expected_tokens):
tokens = sv_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
@ -291,9 +330,9 @@ def test_issue850():
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
matcher = Matcher(vocab)
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
pattern = [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}]
matcher.add('FarAway', None, pattern)
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
pattern = [{"LOWER": "bob"}, {"OP": "*", "IS_ANY_TOKEN": True}, {"LOWER": "frank"}]
matcher.add("FarAway", None, pattern)
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
match = matcher(doc)
assert len(match) == 1
ent_id, start, end = match[0]
@ -306,9 +345,9 @@ def test_issue850_basic():
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
matcher = Matcher(vocab)
IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
pattern = [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}]
matcher.add('FarAway', None, pattern)
doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank'])
pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
matcher.add("FarAway", None, pattern)
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
match = matcher(doc)
assert len(match) == 1
ent_id, start, end = match[0]
@ -316,23 +355,25 @@ def test_issue850_basic():
assert end == 4
@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes",
"terra-formées", "σ-compacts"])
@pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)
def test_issue852(fr_tokenizer, text):
"""Test that French tokenizer exceptions are imported correctly."""
tokens = fr_tokenizer(text)
assert len(tokens) == 1
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
"aaabbb@ccc.com \nThank you!"])
@pytest.mark.parametrize(
"text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]
)
def test_issue859(en_tokenizer, text):
"""Test that no extra space is added in doc.text method."""
doc = en_tokenizer(text)
assert doc.text == text
@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"])
@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"])
def test_issue886(en_tokenizer, text):
"""Test that token.idx matches the original text index for texts with newlines."""
doc = en_tokenizer(text)
@ -341,7 +382,7 @@ def test_issue886(en_tokenizer, text):
assert text[token.idx] == token.text[0]
@pytest.mark.parametrize('text', ["want/need"])
@pytest.mark.parametrize("text", ["want/need"])
def test_issue891(en_tokenizer, text):
"""Test that / infixes are split correctly."""
tokens = en_tokenizer(text)
@ -349,11 +390,10 @@ def test_issue891(en_tokenizer, text):
assert tokens[1].text == "/"
@pytest.mark.parametrize('text,tag,lemma', [
("anus", "NN", "anus"),
("princess", "NN", "princess"),
("inner", "JJ", "inner")
])
@pytest.mark.parametrize(
"text,tag,lemma",
[("anus", "NN", "anus"), ("princess", "NN", "princess"), ("inner", "JJ", "inner")],
)
def test_issue912(en_vocab, text, tag, lemma):
"""Test base-forms are preserved."""
doc = Doc(en_vocab, words=[text])
@ -364,10 +404,10 @@ def test_issue912(en_vocab, text, tag, lemma):
def test_issue957(en_tokenizer):
"""Test that spaCy doesn't hang on many periods."""
# skip test if pytest-timeout is not installed
timeout = pytest.importorskip('pytest-timeout')
string = '0'
timeout = pytest.importorskip("pytest_timeout")  # importable module name, not the pip package name
string = "0"
for i in range(1, 100):
string += '.%d' % i
string += ".%d" % i
doc = en_tokenizer(string)
@ -386,13 +426,13 @@ def test_issue999(train_data):
["hello", []],
["hi", []],
["i'm looking for a place to eat", []],
["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]],
["show me chinese restaurants", [[8,15,"CUISINE"]]],
["show me chines restaurants", [[8,14,"CUISINE"]]],
["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]],
["show me chinese restaurants", [[8, 15, "CUISINE"]]],
["show me chines restaurants", [[8, 14, "CUISINE"]]],
]
nlp = Language()
ner = nlp.create_pipe('ner')
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
for _, offsets in TRAIN_DATA:
for start, end, label in offsets:
@ -402,7 +442,7 @@ def test_issue999(train_data):
for itn in range(100):
random.shuffle(TRAIN_DATA)
for raw_text, entity_offsets in TRAIN_DATA:
nlp.update([raw_text], [{'entities': entity_offsets}])
nlp.update([raw_text], [{"entities": entity_offsets}])
with make_tempdir() as model_dir:
nlp.to_disk(model_dir)

View File

@ -15,76 +15,84 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part
def test_issue1242():
nlp = English()
doc = nlp('')
doc = nlp("")
assert len(doc) == 0
docs = list(nlp.pipe(['', 'hello']))
docs = list(nlp.pipe(["", "hello"]))
assert len(docs[0]) == 0
assert len(docs[1]) == 1
def test_issue1250():
"""Test cached special cases."""
special_case = [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}]
special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
nlp = English()
nlp.tokenizer.add_special_case('reimbur', special_case)
lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
assert lemmas == ['reimburse', ',', 'reimburse', '...']
lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
assert lemmas == ['reimburse', ',', 'reimburse', '...']
nlp.tokenizer.add_special_case("reimbur", special_case)
lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
assert lemmas == ["reimburse", ",", "reimburse", "..."]
lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
assert lemmas == ["reimburse", ",", "reimburse", "..."]
def test_issue1257():
"""Test that tokens compare correctly."""
doc1 = Doc(Vocab(), words=['a', 'b', 'c'])
doc2 = Doc(Vocab(), words=['a', 'c', 'e'])
doc1 = Doc(Vocab(), words=["a", "b", "c"])
doc2 = Doc(Vocab(), words=["a", "c", "e"])
assert doc1[0] != doc2[0]
assert not doc1[0] == doc2[0]
def test_issue1375():
"""Test that token.nbor() raises IndexError for out-of-bounds access."""
doc = Doc(Vocab(), words=['0', '1', '2'])
doc = Doc(Vocab(), words=["0", "1", "2"])
with pytest.raises(IndexError):
assert doc[0].nbor(-1)
assert doc[1].nbor(-1).text == '0'
assert doc[1].nbor(-1).text == "0"
with pytest.raises(IndexError):
assert doc[2].nbor(1)
assert doc[1].nbor(1).text == '2'
assert doc[1].nbor(1).text == "2"
def test_issue1387():
tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
index = {"verb": ("cope","cop")}
tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
index = {"verb": ("cope", "cop")}
exc = {"verb": {"coping": ("cope",)}}
rules = {"verb": [["ing", ""]]}
lemmatizer = Lemmatizer(index, exc, rules)
vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
doc = Doc(vocab, words=["coping"])
doc[0].tag_ = 'VBG'
doc[0].tag_ = "VBG"
assert doc[0].text == "coping"
assert doc[0].lemma_ == "cope"
def test_issue1434():
"""Test matches occur when optional element at end of short doc."""
pattern = [{'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'}]
pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
vocab = Vocab(lex_attr_getters=LEX_ATTRS)
hello_world = Doc(vocab, words=['Hello', 'World'])
hello = Doc(vocab, words=['Hello'])
hello_world = Doc(vocab, words=["Hello", "World"])
hello = Doc(vocab, words=["Hello"])
matcher = Matcher(vocab)
matcher.add('MyMatcher', None, pattern)
matcher.add("MyMatcher", None, pattern)
matches = matcher(hello_world)
assert matches
matches = matcher(hello)
assert matches
@pytest.mark.parametrize('string,start,end', [
('a', 0, 1), ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2),
('a b b c', 0, 3), ('a b b', 0, 3),])
@pytest.mark.parametrize(
"string,start,end",
[
("a", 0, 1),
("a b", 0, 2),
("a c", 0, 1),
("a b c", 0, 2),
("a b b c", 0, 3),
("a b b", 0, 3),
],
)
def test_issue1450(string, start, end):
"""Test matcher works when patterns end with * operator."""
pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}]
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
matcher = Matcher(Vocab())
matcher.add("TSTEND", None, pattern)
doc = Doc(Vocab(), words=string.split())
@ -96,17 +104,20 @@ def test_issue1450(string, start, end):
def test_issue1488():
prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
infix_re = re.compile(r'''[-~\.]''')
simple_url_re = re.compile(r'''^https?://''')
prefix_re = re.compile(r"""[\[\("']""")
suffix_re = re.compile(r"""[\]\)"']""")
infix_re = re.compile(r"""[-~\.]""")
simple_url_re = re.compile(r"""^https?://""")
def my_tokenizer(nlp):
return Tokenizer(nlp.vocab, {},
return Tokenizer(
nlp.vocab,
{},
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=simple_url_re.match)
token_match=simple_url_re.match,
)
nlp = English()
nlp.tokenizer = my_tokenizer(nlp)
@ -116,11 +127,16 @@ def test_issue1488():
def test_issue1494():
infix_re = re.compile(r'''[^a-z]''')
test_cases = [('token 123test', ['token', '1', '2', '3', 'test']),
('token 1test', ['token', '1test']),
('hello...test', ['hello', '.', '.', '.', 'test'])]
new_tokenizer = lambda nlp: Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
infix_re = re.compile(r"""[^a-z]""")
test_cases = [
("token 123test", ["token", "1", "2", "3", "test"]),
("token 1test", ["token", "1test"]),
("hello...test", ["hello", ".", ".", ".", "test"]),
]
def new_tokenizer(nlp):
return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)
nlp = English()
nlp.tokenizer = new_tokenizer(nlp)
for text, expected in test_cases:

View File

@ -45,17 +45,17 @@ def test_issue1506():
def test_issue1518():
"""Test vectors.resize() works."""
vectors = Vectors(shape=(10, 10))
vectors.add('hello', row=2)
vectors.add("hello", row=2)
vectors.resize((5, 9))
def test_issue1537():
"""Test that Span.as_doc() doesn't segfault."""
string = 'The sky is blue . The man is pink . The dog is purple .'
string = "The sky is blue . The man is pink . The dog is purple ."
doc = Doc(Vocab(), words=string.split())
doc[0].sent_start = True
for word in doc[1:]:
if word.nbor(-1).text == '.':
if word.nbor(-1).text == ".":
word.sent_start = True
else:
word.sent_start = False
@ -67,7 +67,7 @@ def test_issue1537():
# TODO: Currently segfaulting, due to l_edge and r_edge misalignment
#def test_issue1537_model():
# def test_issue1537_model():
# nlp = load_spacy('en')
# doc = nlp('The sky is blue. The man is pink. The dog is purple.')
# sents = [s.as_doc() for s in doc.sents]
@ -77,41 +77,41 @@ def test_issue1537():
def test_issue1539():
"""Ensure vectors.resize() doesn't try to modify dictionary during iteration."""
v = Vectors(shape=(10, 10), keys=[5,3,98,100])
v.resize((100,100))
v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100])
v.resize((100, 100))
def test_issue1547():
"""Test that entity labels still match after merging tokens."""
words = ['\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n']
words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
doc = Doc(Vocab(), words=words)
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings['PRODUCT'])]
doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
doc[5:7].merge()
assert [ent.text for ent in doc.ents]
def test_issue1612(en_tokenizer):
doc = en_tokenizer('The black cat purrs.')
span = doc[1: 3]
doc = en_tokenizer("The black cat purrs.")
span = doc[1:3]
assert span.orth_ == span.text
def test_issue1654():
nlp = Language(Vocab())
assert not nlp.pipeline
nlp.add_pipe(lambda doc: doc, name='1')
nlp.add_pipe(lambda doc: doc, name='2', after='1')
nlp.add_pipe(lambda doc: doc, name='3', after='2')
assert nlp.pipe_names == ['1', '2', '3']
nlp.add_pipe(lambda doc: doc, name="1")
nlp.add_pipe(lambda doc: doc, name="2", after="1")
nlp.add_pipe(lambda doc: doc, name="3", after="2")
assert nlp.pipe_names == ["1", "2", "3"]
nlp2 = Language(Vocab())
assert not nlp2.pipeline
nlp2.add_pipe(lambda doc: doc, name='3')
nlp2.add_pipe(lambda doc: doc, name='2', before='3')
nlp2.add_pipe(lambda doc: doc, name='1', before='2')
assert nlp2.pipe_names == ['1', '2', '3']
nlp2.add_pipe(lambda doc: doc, name="3")
nlp2.add_pipe(lambda doc: doc, name="2", before="3")
nlp2.add_pipe(lambda doc: doc, name="1", before="2")
assert nlp2.pipe_names == ["1", "2", "3"]
@pytest.mark.parametrize('text', ['test@example.com', 'john.doe@example.co.uk'])
@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"])
def test_issue1698(en_tokenizer, text):
doc = en_tokenizer(text)
assert len(doc) == 1
@ -121,30 +121,30 @@ def test_issue1698(en_tokenizer, text):
def test_issue1727():
"""Test that models with no pretrained vectors can be deserialized
correctly after vectors are added."""
data = numpy.ones((3, 300), dtype='f')
vectors = Vectors(data=data, keys=['I', 'am', 'Matt'])
data = numpy.ones((3, 300), dtype="f")
vectors = Vectors(data=data, keys=["I", "am", "Matt"])
tagger = Tagger(Vocab())
tagger.add_label('PRP')
tagger.add_label("PRP")
tagger.begin_training()
assert tagger.cfg.get('pretrained_dims', 0) == 0
assert tagger.cfg.get("pretrained_dims", 0) == 0
tagger.vocab.vectors = vectors
with make_tempdir() as path:
tagger.to_disk(path)
tagger = Tagger(Vocab()).from_disk(path)
assert tagger.cfg.get('pretrained_dims', 0) == 0
assert tagger.cfg.get("pretrained_dims", 0) == 0
def test_issue1757():
"""Test comparison against None doesn't cause segfault."""
doc = Doc(Vocab(), words=['a', 'b', 'c'])
doc = Doc(Vocab(), words=["a", "b", "c"])
assert not doc[0] < None
assert not doc[0] == None
assert doc[0] >= None
assert not doc[:2] < None
assert not doc[:2] == None
assert doc[:2] >= None
assert not doc.vocab['a'] == None
assert not doc.vocab['a'] < None
assert not doc.vocab["a"] == None
assert not doc.vocab["a"] < None
def test_issue1758(en_tokenizer):
@ -158,11 +158,20 @@ def test_issue1758(en_tokenizer):
def test_issue1799():
"""Test sentence boundaries are deserialized correctly, even for
non-projective sentences."""
heads_deps = numpy.asarray([[1, 397], [4, 436], [2, 426], [1, 402],
[0, 8206900633647566924], [18446744073709551615, 440],
[18446744073709551614, 442]], dtype='uint64')
doc = Doc(Vocab(), words='Just what I was looking for .'.split())
doc.vocab.strings.add('ROOT')
heads_deps = numpy.asarray(
[
[1, 397],
[4, 436],
[2, 426],
[1, 402],
[0, 8206900633647566924],
[18446744073709551615, 440],
[18446744073709551614, 442],
],
dtype="uint64",
)
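# column 0 holds head offsets cast to uint64 (the huge values wrap around from -1 and -2);
# column 1 holds dep-label hashes, presumably including the hash of "ROOT" added below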
doc = Doc(Vocab(), words="Just what I was looking for .".split())
doc.vocab.strings.add("ROOT")
doc = doc.from_array([HEAD, DEP], heads_deps)
assert len(list(doc.sents)) == 1
@ -170,9 +179,9 @@ def test_issue1799():
def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab()
assert 'hello' not in vocab
vocab.set_vector('hello', numpy.ones((50,), dtype='f'))
assert 'hello' in vocab
assert "hello" not in vocab
vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab
def test_issue1834():
@ -195,34 +204,34 @@ def test_issue1834():
def test_issue1868():
"""Test Vocab.__contains__ works with int keys."""
vocab = Vocab()
lex = vocab['hello']
lex = vocab["hello"]
assert lex.orth in vocab
assert lex.orth_ in vocab
assert 'some string' not in vocab
int_id = vocab.strings.add('some string')
assert "some string" not in vocab
int_id = vocab.strings.add("some string")
assert int_id not in vocab
def test_issue1883():
matcher = Matcher(Vocab())
matcher.add('pat1', None, [{'orth': 'hello'}])
doc = Doc(matcher.vocab, words=['hello'])
matcher.add("pat1", None, [{"orth": "hello"}])
doc = Doc(matcher.vocab, words=["hello"])
assert len(matcher(doc)) == 1
new_matcher = copy.deepcopy(matcher)
new_doc = Doc(new_matcher.vocab, words=['hello'])
new_doc = Doc(new_matcher.vocab, words=["hello"])
assert len(new_matcher(new_doc)) == 1
@pytest.mark.parametrize('word', ['the'])
@pytest.mark.parametrize("word", ["the"])
def test_issue1889(word):
assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS)
def test_issue1915():
cfg = {'hidden_depth': 2} # should error out
cfg = {"hidden_depth": 2} # should error out
nlp = Language()
nlp.add_pipe(nlp.create_pipe('ner'))
nlp.get_pipe('ner').add_label('answer')
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.get_pipe("ner").add_label("answer")
with pytest.raises(ValueError):
nlp.begin_training(**cfg)
@ -230,17 +239,17 @@ def test_issue1915():
def test_issue1945():
"""Test regression in Matcher introduced in v2.0.6."""
matcher = Matcher(Vocab())
matcher.add('MWE', None, [{'orth': 'a'}, {'orth': 'a'}])
doc = Doc(matcher.vocab, words=['a', 'a', 'a'])
matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
doc = Doc(matcher.vocab, words=["a", "a", "a"])
matches = matcher(doc) # we should see two overlapping matches here
assert len(matches) == 2
assert matches[0][1:] == (0, 2)
assert matches[1][1:] == (1, 3)
@pytest.mark.parametrize('label', ['U-JOB-NAME'])
@pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label):
ner = EntityRecognizer(Vocab())
entry = ([0], ['word'], ['tag'], [0], ['dep'], [label])
entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
gold_parses = [(None, [(entry, None)])]
ner.moves.get_actions(gold_parses=gold_parses)

View File

@ -14,15 +14,15 @@ from ..util import add_vecs_to_vocab
def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
ner = nlp.create_pipe('ner')
ner.add_label('CITIZENSHIP')
ner = nlp.create_pipe("ner")
ner.add_label("CITIZENSHIP")
nlp.add_pipe(ner)
nlp.begin_training()
nlp2 = Italian()
nlp2.add_pipe(nlp2.create_pipe('ner'))
nlp2.add_pipe(nlp2.create_pipe("ner"))
nlp2.from_bytes(nlp.to_bytes())
assert 'extra_labels' not in nlp2.get_pipe('ner').cfg
assert nlp2.get_pipe('ner').labels == ['CITIZENSHIP']
assert "extra_labels" not in nlp2.get_pipe("ner").cfg
assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"]
def test_issue2219(en_vocab):
@ -34,7 +34,7 @@ def test_issue2219(en_vocab):
def test_issue2361(de_tokenizer):
chars = ('&lt;', '&gt;', '&amp;', '&quot;')
chars = ("&lt;", "&gt;", "&amp;", "&quot;")
doc = de_tokenizer('< > & " ')
doc.is_parsed = True
doc.is_tagged = True
@ -46,25 +46,32 @@ def test_issue2361(de_tokenizer):
def test_issue2385():
"""Test that IOB tags are correctly converted to BILUO tags."""
# fix bug in labels with a 'b' character
tags1 = ('B-BRAWLER', 'I-BRAWLER', 'I-BRAWLER')
assert iob_to_biluo(tags1) == ['B-BRAWLER', 'I-BRAWLER', 'L-BRAWLER']
tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER")
assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"]
# maintain support for iob1 format
tags2 = ('I-ORG', 'I-ORG', 'B-ORG')
assert iob_to_biluo(tags2) == ['B-ORG', 'L-ORG', 'U-ORG']
tags2 = ("I-ORG", "I-ORG", "B-ORG")
assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"]
# maintain support for iob2 format
tags3 = ('B-PERSON', 'I-PERSON', 'B-PERSON')
assert iob_to_biluo(tags3) ==['B-PERSON', 'L-PERSON', 'U-PERSON']
tags3 = ("B-PERSON", "I-PERSON", "B-PERSON")
assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"]
@pytest.mark.parametrize('tags', [
('B-ORG', 'L-ORG'), ('B-PERSON', 'I-PERSON', 'L-PERSON'), ('U-BRAWLER', 'U-BRAWLER')])
@pytest.mark.parametrize(
"tags",
[
("B-ORG", "L-ORG"),
("B-PERSON", "I-PERSON", "L-PERSON"),
("U-BRAWLER", "U-BRAWLER"),
],
)
def test_issue2385_biluo(tags):
"""Test that BILUO-compatible tags aren't modified."""
assert iob_to_biluo(tags) == list(tags)
def test_issue2482():
'''Test we can serialize and deserialize a blank NER or parser model.'''
"""Test we can serialize and deserialize a blank NER or parser model."""
nlp = Italian()
nlp.add_pipe(nlp.create_pipe('ner'))
nlp.add_pipe(nlp.create_pipe("ner"))
b = nlp.to_bytes()
nlp2 = Italian().from_bytes(b)

View File

@ -7,11 +7,11 @@ from spacy.language import Language
def test_issue2564():
"""Test the tagger sets is_tagged correctly when used via Language.pipe."""
nlp = Language()
tagger = nlp.create_pipe('tagger')
tagger = nlp.create_pipe("tagger")
tagger.begin_training() # initialise weights
nlp.add_pipe(tagger)
doc = nlp('hello world')
doc = nlp("hello world")
assert doc.is_tagged
docs = nlp.pipe(['hello', 'world'])
docs = nlp.pipe(["hello", "world"])
piped_doc = next(docs)
assert piped_doc.is_tagged

View File

@ -7,11 +7,11 @@ from spacy.tokens import Span
def test_issue2569(en_tokenizer):
doc = en_tokenizer("It is May 15, 1993.")
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings['DATE'])]
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
matcher = Matcher(doc.vocab)
matcher.add("RULE", None, [{'ENT_TYPE':'DATE', 'OP':'+'}])
matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
matched = [doc[start:end] for _, start, end in matcher(doc)]
matched = sorted(matched, key=len, reverse=True)
assert len(matched) == 10
assert len(matched[0]) == 4
assert matched[0].text == 'May 15, 1993'
assert matched[0].text == "May 15, 1993"

View File

@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
from spacy.matcher import Matcher
@ -10,6 +9,7 @@ def test_issue2671():
"""Ensure the correct entity ID is returned for matches with quantifiers.
See also #2675
"""
def get_rule_id(nlp, matcher, doc):
matches = matcher(doc)
for match_id, start, end in matches:
@ -19,10 +19,12 @@ def test_issue2671():
nlp = English()
matcher = Matcher(nlp.vocab)
pattern_id = 'test_pattern'
pattern = [{'LOWER': 'high'},
{'IS_PUNCT': True, 'OP': '?'},
{'LOWER': 'adrenaline'}]
pattern_id = "test_pattern"
pattern = [
{"LOWER": "high"},
{"IS_PUNCT": True, "OP": "?"},
{"LOWER": "adrenaline"},
]
matcher.add(pattern_id, None, pattern)
doc1 = nlp("This is a high-adrenaline situation.")
doc2 = nlp("This is a high adrenaline situation.")

View File

@ -1,17 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from ..util import get_doc
def test_issue2772(en_vocab):
"""Test that deprojectivization doesn't mess up sentence boundaries."""
words = 'When we write or communicate virtually , we can hide our true feelings .'.split()
words = "When we write or communicate virtually , we can hide our true feelings .".split()
# A tree with a non-projective (i.e. crossing) arc
# The arcs (0, 4) and (2, 9) cross.
heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1]
deps = ['dep'] * len(heads)
deps = ["dep"] * len(heads)
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
assert doc[1].is_sent_start is None

View File

@ -5,8 +5,8 @@ from spacy.util import get_lang_class
import pytest
@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1'])
@pytest.mark.parametrize('lang', ['en', 'xx'])
@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"])
@pytest.mark.parametrize("lang", ["en", "xx"])
def test_issue2782(text, lang):
"""Check that like_num handles + and - before number."""
cls = get_lang_class(lang)

View File

@ -18,25 +18,25 @@ def test_serialize_empty_doc(en_vocab):
def test_serialize_doc_roundtrip_bytes(en_vocab):
doc = Doc(en_vocab, words=['hello', 'world'])
doc = Doc(en_vocab, words=["hello", "world"])
doc_b = doc.to_bytes()
new_doc = Doc(en_vocab).from_bytes(doc_b)
assert new_doc.to_bytes() == doc_b
def test_serialize_doc_roundtrip_disk(en_vocab):
doc = Doc(en_vocab, words=['hello', 'world'])
doc = Doc(en_vocab, words=["hello", "world"])
with make_tempdir() as d:
file_path = d / 'doc'
file_path = d / "doc"
doc.to_disk(file_path)
doc_d = Doc(en_vocab).from_disk(file_path)
assert doc.to_bytes() == doc_d.to_bytes()
def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
doc = Doc(en_vocab, words=['hello', 'world'])
doc = Doc(en_vocab, words=["hello", "world"])
with make_tempdir() as d:
file_path = d / 'doc'
file_path = d / "doc"
file_path = path2str(file_path)
doc.to_disk(file_path)
doc_d = Doc(en_vocab).from_disk(file_path)

View File

@ -8,19 +8,20 @@ from spacy.vocab import Vocab
@pytest.fixture
def doc_w_attrs(en_tokenizer):
Doc.set_extension('_test_attr', default=False)
Doc.set_extension('_test_prop', getter=lambda doc: len(doc.text))
Doc.set_extension('_test_method', method=lambda doc, arg: "{}{}".format(len(doc.text), arg))
Doc.set_extension("_test_attr", default=False)
Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text))
Doc.set_extension(
"_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg)
)
doc = en_tokenizer("This is a test.")
doc._._test_attr = 'test'
doc._._test_attr = "test"
return doc
def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
doc_b = doc_w_attrs.to_bytes()
doc = Doc(Vocab()).from_bytes(doc_b)
assert doc._.has('_test_attr')
assert doc._._test_attr == 'test'
assert doc._.has("_test_attr")
assert doc._._test_attr == "test"
assert doc._._test_prop == len(doc.text)
assert doc._._test_method('test') == '{}{}'.format(len(doc.text), 'test')
assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test")

View File

@ -12,14 +12,14 @@ from ..util import make_tempdir
@pytest.fixture
def meta_data():
return {
'name': 'name-in-fixture',
'version': 'version-in-fixture',
'description': 'description-in-fixture',
'author': 'author-in-fixture',
'email': 'email-in-fixture',
'url': 'url-in-fixture',
'license': 'license-in-fixture',
'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}
"name": "name-in-fixture",
"version": "version-in-fixture",
"description": "description-in-fixture",
"author": "author-in-fixture",
"email": "email-in-fixture",
"url": "url-in-fixture",
"license": "license-in-fixture",
"vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None},
}
@ -35,16 +35,18 @@ def test_serialize_with_custom_tokenizer():
"""Test that serialization with custom tokenizer works without token_match.
See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2
"""
prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''')
suffix_re = re.compile(r'''''')
infix_re = re.compile(r'''[~]''')
prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""")
suffix_re = re.compile(r"""""")
infix_re = re.compile(r"""[~]""")
def custom_tokenizer(nlp):
return Tokenizer(nlp.vocab,
return Tokenizer(
nlp.vocab,
{},
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer)
infix_finditer=infix_re.finditer,
)
nlp = Language()
nlp.tokenizer = custom_tokenizer(nlp)

View File

@ -2,7 +2,8 @@
from __future__ import unicode_literals
import pytest
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer, Tensorizer, TextCategorizer
from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
from spacy.pipeline import Tensorizer, TextCategorizer
from ..util import make_tempdir
@ -13,7 +14,7 @@ test_parsers = [DependencyParser, EntityRecognizer]
@pytest.fixture
def parser(en_vocab):
parser = DependencyParser(en_vocab)
parser.add_label('nsubj')
parser.add_label("nsubj")
parser.model, cfg = parser.Model(parser.moves.n_moves)
parser.cfg.update(cfg)
return parser
@ -34,7 +35,7 @@ def taggers(en_vocab):
return (tagger1, tagger2)
@pytest.mark.parametrize('Parser', test_parsers)
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
parser = Parser(en_vocab)
parser.model, _ = parser.Model(10)
@ -44,12 +45,12 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
assert new_parser.to_bytes() == parser.to_bytes()
@pytest.mark.parametrize('Parser', test_parsers)
@pytest.mark.parametrize("Parser", test_parsers)
def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
parser = Parser(en_vocab)
parser.model, _ = parser.Model(0)
with make_tempdir() as d:
file_path = d / 'parser'
file_path = d / "parser"
parser.to_disk(file_path)
parser_d = Parser(en_vocab)
parser_d.model, _ = parser_d.Model(0)
@ -67,7 +68,9 @@ def test_to_from_bytes(parser, blank_parser):
assert blank_parser.moves.n_moves == parser.moves.n_moves
@pytest.mark.skip(reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms.")
@pytest.mark.skip(
reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms."
)
def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
tagger1, tagger2 = taggers
tagger1_b = tagger1.to_bytes()
@ -81,8 +84,8 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
tagger1, tagger2 = taggers
with make_tempdir() as d:
file_path1 = d / 'tagger1'
file_path2 = d / 'tagger2'
file_path1 = d / "tagger1"
file_path2 = d / "tagger2"
tagger1.to_disk(file_path1)
tagger2.to_disk(file_path2)
tagger1_d = Tagger(en_vocab).from_disk(file_path1)
@ -102,7 +105,7 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
tensorizer = Tensorizer(en_vocab)
tensorizer.model = tensorizer.Model()
with make_tempdir() as d:
file_path = d / 'tensorizer'
file_path = d / "tensorizer"
tensorizer.to_disk(file_path)
tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
@ -110,5 +113,5 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
def test_serialize_textcat_empty(en_vocab):
# See issue #1105
textcat = TextCategorizer(en_vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
textcat_bytes = textcat.to_bytes()

View File

@ -9,7 +9,7 @@ from ..util import make_tempdir, assert_packed_msg_equal
def load_tokenizer(b):
tok = get_lang_class('en').Defaults.create_tokenizer()
tok = get_lang_class("en").Defaults.create_tokenizer()
tok.from_bytes(b)
return tok
@ -23,7 +23,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
@pytest.mark.skip(reason="Currently unreliable across platforms")
@pytest.mark.parametrize('text', ["I💜you", "theyre", "“hello”"])
@pytest.mark.parametrize("text", ["I💜you", "theyre", "“hello”"])
def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
tokenizer = en_tokenizer
new_tokenizer = load_tokenizer(tokenizer.to_bytes())
@ -38,7 +38,7 @@ def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text):
def test_serialize_tokenizer_roundtrip_disk(en_tokenizer):
tokenizer = en_tokenizer
with make_tempdir() as d:
file_path = d / 'tokenizer'
file_path = d / "tokenizer"
tokenizer.to_disk(file_path)
tokenizer_d = en_tokenizer.from_disk(file_path)
assert tokenizer.to_bytes() == tokenizer_d.to_bytes()

View File

@ -8,12 +8,12 @@ from spacy.strings import StringStore
from ..util import make_tempdir
test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])]
test_strings_attrs = [(['rats', 'are', 'cute'], 'Hello')]
test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
@pytest.mark.xfail
@pytest.mark.parametrize('text', ['rat'])
@pytest.mark.parametrize("text", ["rat"])
def test_serialize_vocab(en_vocab, text):
text_hash = en_vocab.strings.add(text)
vocab_bytes = en_vocab.to_bytes()
@ -21,7 +21,7 @@ def test_serialize_vocab(en_vocab, text):
assert new_vocab.strings(text_hash) == text
@pytest.mark.parametrize('strings1,strings2', test_strings)
@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
vocab1 = Vocab(strings=strings1)
vocab2 = Vocab(strings=strings2)
@ -39,13 +39,13 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1)
@pytest.mark.parametrize('strings1,strings2', test_strings)
def test_serialize_vocab_roundtrip_disk(strings1,strings2):
@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
vocab1 = Vocab(strings=strings1)
vocab2 = Vocab(strings=strings2)
with make_tempdir() as d:
file_path1 = d / 'vocab1'
file_path2 = d / 'vocab2'
file_path1 = d / "vocab1"
file_path2 = d / "vocab2"
vocab1.to_disk(file_path1)
vocab2.to_disk(file_path2)
vocab1_d = Vocab().from_disk(file_path1)
@ -58,7 +58,7 @@ def test_serialize_vocab_roundtrip_disk(strings1,strings2):
assert list(vocab1_d) != list(vocab2_d)
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
vocab1 = Vocab(strings=strings)
vocab2 = Vocab()
@ -69,7 +69,7 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
assert vocab2[strings[0]].norm_ == lex_attr
@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs)
@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
vocab1 = Vocab(strings=strings)
vocab2 = Vocab()
@ -77,13 +77,13 @@ def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
assert vocab1[strings[0]].norm_ == lex_attr
assert vocab2[strings[0]].norm_ != lex_attr
with make_tempdir() as d:
file_path = d / 'vocab'
file_path = d / "vocab"
vocab1.to_disk(file_path)
vocab2 = vocab2.from_disk(file_path)
assert vocab2[strings[0]].norm_ == lex_attr
@pytest.mark.parametrize('strings1,strings2', test_strings)
@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
sstore1 = StringStore(strings=strings1)
sstore2 = StringStore(strings=strings2)
@ -100,13 +100,13 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
assert list(new_sstore1) == strings1
@pytest.mark.parametrize('strings1,strings2', test_strings)
@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
sstore1 = StringStore(strings=strings1)
sstore2 = StringStore(strings=strings2)
with make_tempdir() as d:
file_path1 = d / 'strings1'
file_path2 = d / 'strings2'
file_path1 = d / "strings1"
file_path2 = d / "strings2"
sstore1.to_disk(file_path1)
sstore2.to_disk(file_path2)
sstore1_d = StringStore().from_disk(file_path1)

View File

@ -5,52 +5,63 @@ import pytest
from spacy._align import align, multi_align
@pytest.mark.parametrize('string1,string2,cost', [
('hello', 'hell', 1),
('rat', 'cat', 1),
('rat', 'rat', 0),
('rat', 'catsie', 4),
('t', 'catsie', 5),
])
@pytest.mark.parametrize(
"string1,string2,cost",
[
("hello", "hell", 1),
("rat", "cat", 1),
("rat", "rat", 0),
("rat", "catsie", 4),
("t", "catsie", 5),
],
)
def test_align_costs(string1, string2, cost):
output_cost, i2j, j2i, matrix = align(string1, string2)
assert output_cost == cost
@pytest.mark.parametrize('string1,string2,i2j', [
('hello', 'hell', [0,1,2,3,-1]),
('rat', 'cat', [0,1,2]),
('rat', 'rat', [0,1,2]),
('rat', 'catsie', [0,1,2]),
('t', 'catsie', [2]),
])
@pytest.mark.parametrize(
"string1,string2,i2j",
[
("hello", "hell", [0, 1, 2, 3, -1]),
("rat", "cat", [0, 1, 2]),
("rat", "rat", [0, 1, 2]),
("rat", "catsie", [0, 1, 2]),
("t", "catsie", [2]),
],
)
def test_align_i2j(string1, string2, i2j):
output_cost, output_i2j, j2i, matrix = align(string1, string2)
assert list(output_i2j) == i2j
@pytest.mark.parametrize('string1,string2,j2i', [
('hello', 'hell', [0,1,2,3]),
('rat', 'cat', [0,1,2]),
('rat', 'rat', [0,1,2]),
('rat', 'catsie', [0,1,2, -1, -1, -1]),
('t', 'catsie', [-1, -1, 0, -1, -1, -1]),
])
@pytest.mark.parametrize(
"string1,string2,j2i",
[
("hello", "hell", [0, 1, 2, 3]),
("rat", "cat", [0, 1, 2]),
("rat", "rat", [0, 1, 2]),
("rat", "catsie", [0, 1, 2, -1, -1, -1]),
("t", "catsie", [-1, -1, 0, -1, -1, -1]),
],
)
def test_align_i2j(string1, string2, j2i):
output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
assert list(output_j2i) == j2i
def test_align_strings():
words1 = ['hello', 'this', 'is', 'test!']
words2 = ['hellothis', 'is', 'test', '!']
words1 = ["hello", "this", "is", "test!"]
words2 = ["hellothis", "is", "test", "!"]
cost, i2j, j2i, matrix = align(words1, words2)
assert cost == 4
assert list(i2j) == [-1, -1, 1, -1]
assert list(j2i) == [-1, 2, -1, -1]
def test_align_many_to_one():
words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
words2 = ['ab', 'bc', 'e', 'fg', 'h']
words1 = ["a", "b", "c", "d", "e", "f", "g", "h"]
words2 = ["ab", "bc", "e", "fg", "h"]
cost, i2j, j2i, matrix = align(words1, words2)
assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4]
lengths1 = [len(w) for w in words1]
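
A rough usage sketch of the align() call these tests drive; spacy._align is a private module, so the return signature shown here is inferred from the tests above rather than from documented API.

from spacy._align import align

words_gold = ["hello", "this", "is", "test!"]
words_doc = ["hellothis", "is", "test", "!"]

# align() returns (edit_cost, i2j, j2i, matrix); i2j[i] is the index in
# words_doc that words_gold[i] maps onto one-to-one, or -1 if there is none.
cost, i2j, j2i, matrix = align(words_gold, words_doc)
assert cost == 4
assert list(i2j) == [-1, -1, 1, -1]  # only "is" lines up exactly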


@ -8,75 +8,78 @@ from .util import get_doc
def test_gold_biluo_U(en_vocab):
orths_and_spaces = [('I', True), ('flew', True), ('to', True),
('London', False), ('.', True)]
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
entities = [(len("I flew to "), len("I flew to London"), 'LOC')]
words = ["I", "flew", "to", "London", "."]
spaces = [True, True, True, False, True]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to London"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'O', 'U-LOC', 'O']
assert tags == ["O", "O", "O", "U-LOC", "O"]
def test_gold_biluo_BL(en_vocab):
orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
('Francisco', False), ('.', True)]
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
entities = [(len("I flew to "), len("I flew to San Francisco"), 'LOC')]
words = ["I", "flew", "to", "San", "Francisco", "."]
spaces = [True, True, True, True, False, True]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O']
assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
def test_gold_biluo_BIL(en_vocab):
orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
('Francisco', True), ('Valley', False), ('.', True)]
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
spaces = [True, True, True, True, True, False, True]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
def test_gold_biluo_misalign(en_vocab):
orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True),
('Francisco', True), ('Valley.', False)]
doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')]
words = ["I", "flew", "to", "San", "Francisco", "Valley."]
spaces = [True, True, True, True, True, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'O', '-', '-', '-']
assert tags == ["O", "O", "O", "-", "-", "-"]
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
text = "I flew to Silicon Valley via London."
biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O']
offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')]
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
doc = en_tokenizer(text)
biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
assert biluo_tags_converted == biluo_tags
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
assert offsets_converted == offsets
def test_docs_to_json(en_vocab):
'''Test we can convert a list of Doc objects into the JSON-serializable
"""Test we can convert a list of Doc objects into the JSON-serializable
format we use for training.
'''
"""
docs = [
get_doc(
en_vocab,
words=['a', 'b'],
pos=['VBP', 'NN'],
words=["a", "b"],
pos=["VBP", "NN"],
heads=[0, -1],
deps=['ROOT', 'dobj'],
ents=[]),
deps=["ROOT", "dobj"],
ents=[],
),
get_doc(
en_vocab,
words=['c', 'd', 'e'],
pos=['VBP', 'NN', 'NN'],
words=["c", "d", "e"],
pos=["VBP", "NN", "NN"],
heads=[0, -1, -2],
deps=['ROOT', 'dobj', 'dobj'],
ents=[(1, 2, 'ORG')]),
deps=["ROOT", "dobj", "dobj"],
ents=[(1, 2, "ORG")],
),
]
json_doc = docs_to_json(0, docs)
assert json_doc['id'] == 0
assert len(json_doc['paragraphs']) == 2
assert len(json_doc['paragraphs'][0]['sentences']) == 1
assert len(json_doc['paragraphs'][1]['sentences']) == 1
assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2
assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3
assert json_doc["id"] == 0
assert len(json_doc["paragraphs"]) == 2
assert len(json_doc["paragraphs"][0]["sentences"]) == 1
assert len(json_doc["paragraphs"][1]["sentences"]) == 1
assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2
assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3
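
For context, a minimal end-to-end sketch of the BILUO round trip tested above; in spaCy 2.x these helpers live in spacy.gold (the import path is assumed here, since the file's import block is not part of this hunk).

from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.lang.en import English

nlp = English()  # a tokenizer-only pipeline is enough for offset conversion
doc = nlp("I flew to Silicon Valley via London.")
offsets = [(10, 24, "LOC"), (29, 35, "GPE")]  # character offsets into doc.text

tags = biluo_tags_from_offsets(doc, offsets)
assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
# The conversion is lossless when the offsets line up with token boundaries:
assert offsets_from_biluo_tags(doc, tags) == offsets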


@ -11,19 +11,19 @@ from spacy._ml import PrecomputableAffine
from .util import get_doc
@pytest.mark.parametrize('text', ['hello/world', 'hello world'])
@pytest.mark.parametrize("text", ["hello/world", "hello world"])
def test_util_ensure_path_succeeds(text):
path = util.ensure_path(text)
assert isinstance(path, Path)
@pytest.mark.parametrize('package', ['numpy'])
@pytest.mark.parametrize("package", ["numpy"])
def test_util_is_package(package):
"""Test that an installed package via pip is recognised by util.is_package."""
assert util.is_package(package)
@pytest.mark.parametrize('package', ['thinc'])
@pytest.mark.parametrize("package", ["thinc"])
def test_util_get_package_path(package):
"""Test that a Path object is returned for a package name."""
path = util.get_package_path(package)
@ -33,44 +33,47 @@ def test_util_get_package_path(package):
def test_displacy_parse_ents(en_vocab):
"""Test that named entities on a Doc are converted into displaCy's format."""
doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])]
doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
ents = displacy.parse_ents(doc)
assert isinstance(ents, dict)
assert ents['text'] == 'But Google is starting from behind '
assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}]
assert ents["text"] == "But Google is starting from behind "
assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}]
def test_displacy_parse_deps(en_vocab):
"""Test that deps and tags on a Doc are converted into displaCy's format."""
words = ["This", "is", "a", "sentence"]
heads = [1, 0, 1, -2]
pos = ['DET', 'VERB', 'DET', 'NOUN']
tags = ['DT', 'VBZ', 'DT', 'NN']
deps = ['nsubj', 'ROOT', 'det', 'attr']
doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags,
deps=deps)
pos = ["DET", "VERB", "DET", "NOUN"]
tags = ["DT", "VBZ", "DT", "NN"]
deps = ["nsubj", "ROOT", "det", "attr"]
doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps)
deps = displacy.parse_deps(doc)
assert isinstance(deps, dict)
assert deps['words'] == [{'text': 'This', 'tag': 'DET'},
{'text': 'is', 'tag': 'VERB'},
{'text': 'a', 'tag': 'DET'},
{'text': 'sentence', 'tag': 'NOUN'}]
assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
assert deps["words"] == [
{"text": "This", "tag": "DET"},
{"text": "is", "tag": "VERB"},
{"text": "a", "tag": "DET"},
{"text": "sentence", "tag": "NOUN"},
]
assert deps["arcs"] == [
{"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
{"start": 2, "end": 3, "label": "det", "dir": "left"},
{"start": 1, "end": 3, "label": "attr", "dir": "right"},
]
def test_displacy_spans(en_vocab):
"""Test that displaCy can render Spans."""
doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])]
html = displacy.render(doc[1:4], style='ent')
assert html.startswith('<div')
doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
html = displacy.render(doc[1:4], style="ent")
assert html.startswith("<div")
def test_displacy_raises_for_wrong_type(en_vocab):
with pytest.raises(ValueError):
html = displacy.render('hello world')
html = displacy.render("hello world")
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
@ -78,22 +81,22 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
assert model.W.shape == (nF, nO, nP, nI)
tensor = model.ops.allocate((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
assert model.d_pad.shape == (1, nF, nO, nP)
dY = model.ops.allocate((15, nO, nP))
ids = model.ops.allocate((15, nF))
ids[1,2] = -1
ids[1, 2] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
assert model.d_pad[0, 2, 0, 0] == 0.0
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.
model.d_pad.fill(0.)
ids.fill(0.)
dY.fill(0.)
ids[1,2] = -1
ids[1,1] = -1
ids[1,0] = -1
assert model.d_pad[0, 2, 0, 0] == 1.0
model.d_pad.fill(0.0)
ids.fill(0.0)
dY.fill(0.0)
ids[1, 2] = -1
ids[1, 1] = -1
ids[1, 0] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
assert model.d_pad[0, 2, 0, 0] == 0.0
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 3.
assert model.d_pad[0, 2, 0, 0] == 3.0
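
As a companion to the displaCy checks in this file, a small standalone sketch of the entity payload format; English() is used only for its vocab, no statistical model is required, and the sentence mirrors the test above.

from spacy import displacy
from spacy.lang.en import English
from spacy.tokens import Doc, Span

vocab = English().vocab
doc = Doc(vocab, words=["But", "Google", "is", "starting", "from", "behind"])
doc.ents = [Span(doc, 1, 2, label=vocab.strings["ORG"])]

ents_data = displacy.parse_ents(doc)
# The entity payload uses character offsets into doc.text:
assert ents_data["ents"] == [{"start": 4, "end": 10, "label": "ORG"}]

html = displacy.render(doc[1:4], style="ent")  # returns markup as a string outside Jupyter
assert html.startswith("<div")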


@ -9,7 +9,7 @@ from spacy.vocab import Vocab
from spacy.attrs import NORM
@pytest.mark.parametrize('text1,text2', [('hello', 'bye')])
@pytest.mark.parametrize("text1,text2", [("hello", "bye")])
def test_pickle_string_store(text1, text2):
stringstore = StringStore()
store1 = stringstore[text1]
@ -21,10 +21,10 @@ def test_pickle_string_store(text1, text2):
assert len(stringstore) == len(unpickled)
@pytest.mark.parametrize('text1,text2', [('dog', 'cat')])
@pytest.mark.parametrize("text1,text2", [("dog", "cat")])
def test_pickle_vocab(text1, text2):
vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
vocab.set_vector('dog', numpy.ones((5,), dtype='f'))
vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
lex1 = vocab[text1]
lex2 = vocab[text2]
assert lex1.norm_ == text1[:-1]
@ -37,4 +37,4 @@ def test_pickle_vocab(text1, text2):
assert unpickled[text2].norm == lex2.norm
assert unpickled[text1].norm != unpickled[text2].norm
assert unpickled.vectors is not None
assert list(vocab['dog'].vector) == [1.,1.,1.,1.,1.]
assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
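
A brief sketch of the vector behaviour the pickle test relies on; plain pickle is assumed to work for Vocab here, since this hunk does not show which serializer the test actually uses.

import pickle

import numpy
from spacy.vocab import Vocab

vocab = Vocab()
vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]

unpickled = pickle.loads(pickle.dumps(vocab))  # assumption: stdlib pickle suffices
assert unpickled.vectors is not None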


@ -29,17 +29,19 @@ def test_tokenizer_handles_emoticons(tokenizer):
assert tokens[17].text == ":D"
assert tokens[18].text == "=|"
assert tokens[19].text == '")'
assert tokens[20].text == ':>'
assert tokens[21].text == '....'
assert tokens[20].text == ":>"
assert tokens[21].text == "...."
@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)])
@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)])
def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length):
tokens = tokenizer(text)
assert len(tokens) == length
@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8),
('i💙you', 3), ('🤘🤘yay!', 4)])
@pytest.mark.parametrize(
"text,length", [("can you still dunk?🍕🍔😵LOL", 8), ("i💙you", 3), ("🤘🤘yay!", 4)]
)
def test_tokenizer_handles_emoji(tokenizer, text, length):
# These break on narrow unicode builds, e.g. Windows
if sys.maxunicode >= 1114111:


@ -12,11 +12,9 @@ NAUGHTY_STRINGS = [
",./;'[]\-=",
'<>?:"{}|_+',
'!@#$%^&*()`~"',
# Unicode additional control characters, byte order marks
"­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪",
"",
# Unicode Symbols
"Ω≈ç√∫˜µ≤≥÷",
"åß∂ƒ©˙∆˚¬…æ",
@ -29,13 +27,11 @@ NAUGHTY_STRINGS = [
"⅛⅜⅝⅞",
"ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя",
"٠١٢٣٤٥٦٧٨٩",
# Unicode Subscript/Superscript/Accents
"⁰⁴⁵",
"₀₁₂",
"⁰⁴⁵₀₁₂",
"ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็",
# Two-Byte Characters
"田中さんにあげて下さい",
"パーティーへ行かないか",
@ -46,7 +42,6 @@ NAUGHTY_STRINGS = [
"社會科學院語學研究所",
"울란바토르",
"𠜎𠜱𠝹𠱓𠱸𠲖𠳏",
# Japanese Emoticons
"ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ",
"(。◕ ∀ ◕。)",
@ -55,11 +50,9 @@ NAUGHTY_STRINGS = [
"・( ̄∀ ̄)・:*:",
"゚・✿ヾ╲(。◕‿◕。)╱✿・゚",
",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’",
"(╯°□°)╯︵ ┻━┻)"
"(ノಥ益ಥ)ノ ┻━┻",
"(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻",
"┬─┬ノ( º _ ºノ)",
"( ͡° ͜ʖ ͡°)",
# Emoji
"😍",
"👩🏽",
@ -69,18 +62,14 @@ NAUGHTY_STRINGS = [
"✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿",
"🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧",
"0⃣ 1⃣ 2⃣ 3⃣ 4⃣ 5⃣ 6⃣ 7⃣ 8⃣ 9⃣ 🔟",
# Regional Indicator Symbols
"🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸",
"🇺🇸🇷🇺🇸🇦🇫🇦🇲",
"🇺🇸🇷🇺🇸🇦",
# Unicode Numbers
"",
"١٢٣",
# Right-To-Left Strings
"ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.",
"إيو.",
"בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ",
@ -88,34 +77,21 @@ NAUGHTY_STRINGS = [
"",
"",
"مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،",
# Trick Unicode
"test",
"test",
"test",
"testtest",
"test",
# Zalgo Text
"Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣",
"̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰",
"̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟",
"̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕",
"Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮",
# Unicode Upsidedown
"˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥",
"00˙Ɩ$-",
# Unicode font
" ",
"𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠",
@ -125,19 +101,17 @@ NAUGHTY_STRINGS = [
"𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘",
"𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐",
"⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢",
# File paths
"../../../../../../../../../../../etc/passwd%00",
"../../../../../../../../../../../etc/hosts",
# iOS Vulnerabilities
"Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗",
"🏳0🌈"
"🏳0🌈",
]
@pytest.mark.slow
@pytest.mark.parametrize('text', NAUGHTY_STRINGS)
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_tokenizer_naughty_strings(tokenizer, text):
tokens = tokenizer(text)
assert tokens.text_with_ws == text

Some files were not shown because too many files have changed in this diff.