Tidy up and format remaining files

Ines Montani 2018-11-30 17:43:08 +01:00
parent 2a95133138
commit 323fc26880
35 changed files with 391 additions and 366 deletions

View File

@@ -11,3 +11,4 @@ exclude =
     _tokenizer_exceptions_list.py,
     spacy/lang/fr/lemmatizer,
     spacy/lang/nb/lemmatizer
+    spacy/__init__.py

View File

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

 import warnings
+
 warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
@@ -15,7 +16,7 @@ from . import util


 def load(name, **overrides):
-    depr_path = overrides.get('path')
+    depr_path = overrides.get("path")
     if depr_path not in (True, False, None):
         deprecation_warning(Warnings.W001.format(path=depr_path))
     return util.load_model(name, **overrides)

View File

@@ -29,7 +29,7 @@ from . import util
 try:
     import torch.nn
     from thinc.extra.wrappers import PyTorchWrapperRNN
-except:
+except ImportError:
     torch = None

 VECTORS_KEY = "spacy_pretrained_vectors"

View File

@@ -479,14 +479,11 @@ class Language(object):
         for _, annots_brackets in get_gold_tuples():
             for annots, _ in annots_brackets:
                 for word in annots[1]:
-                    _ = self.vocab[word]
-        contexts = []
+                    _ = self.vocab[word]  # noqa: F841
         if cfg.get("device", -1) >= 0:
-            device = util.use_gpu(cfg["device"])
+            util.use_gpu(cfg["device"])
             if self.vocab.vectors.data.shape[1] >= 1:
                 self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
-        else:
-            device = None
         link_vectors_to_models(self.vocab)
         if self.vocab.vectors.data.shape[1]:
             cfg["pretrained_vectors"] = self.vocab.vectors.name
@@ -742,7 +739,7 @@ class Language(object):
             if not hasattr(proc, "from_bytes"):
                 continue
             deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
-        msg = util.from_bytes(bytes_data, deserializers, {})
+        util.from_bytes(bytes_data, deserializers, {})
         return self

View File

@@ -97,13 +97,13 @@ def da_tokenizer():
 @pytest.fixture(scope="session")
 def ja_tokenizer():
-    mecab = pytest.importorskip("MeCab")
+    pytest.importorskip("MeCab")
     return get_lang_class("ja").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
 def th_tokenizer():
-    pythainlp = pytest.importorskip("pythainlp")
+    pytest.importorskip("pythainlp")
     return get_lang_class("th").Defaults.create_tokenizer()
@@ -112,9 +112,9 @@ def tr_tokenizer():
     return get_lang_class("tr").Defaults.create_tokenizer()


-@pytest.fixture(scope='session')
+@pytest.fixture(scope="session")
 def ca_tokenizer():
-    return get_lang_class('ca').Defaults.create_tokenizer()
+    return get_lang_class("ca").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
@@ -139,7 +139,7 @@ def ur_tokenizer():
 @pytest.fixture(scope="session")
 def ru_tokenizer():
-    pymorphy = pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2")
     return get_lang_class("ru").Defaults.create_tokenizer()

View File

@@ -14,15 +14,13 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
     ner = EntityRecognizer(en_vocab)
     ner.begin_training([])
     ner(doc)
     assert len(list(doc.ents)) == 0
-    assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))
-    doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
-    assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
-    doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
-    assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']
+    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
+    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
+    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
+    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
+    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]


 def test_add_overlapping_entities(en_vocab):
     text = ["Louisiana", "Office", "of", "Conservation"]

View File

@@ -174,18 +174,20 @@ def test_doc_api_merge(en_tokenizer):
     doc = en_tokenizer(text)
     assert len(doc) == 9
     with doc.retokenize() as retokenizer:
-        retokenizer.merge(doc[4: 7], attrs={'tag':'NAMED', 'lemma':'LEMMA',
-                                            'ent_type':'TYPE'})
-        retokenizer.merge(doc[7: 9], attrs={'tag':'NAMED', 'lemma':'LEMMA',
-                                            'ent_type':'TYPE'})
+        retokenizer.merge(
+            doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+        )
+        retokenizer.merge(
+            doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+        )
     assert len(doc) == 6
-    assert doc[4].text == 'the beach boys'
-    assert doc[4].text_with_ws == 'the beach boys '
-    assert doc[4].tag_ == 'NAMED'
-    assert doc[5].text == 'all night'
-    assert doc[5].text_with_ws == 'all night'
-    assert doc[5].tag_ == 'NAMED'
+    assert doc[4].text == "the beach boys"
+    assert doc[4].text_with_ws == "the beach boys "
+    assert doc[4].tag_ == "NAMED"
+    assert doc[5].text == "all night"
+    assert doc[5].text_with_ws == "all night"
+    assert doc[5].tag_ == "NAMED"


 def test_doc_api_merge_children(en_tokenizer):

View File

@@ -16,7 +16,7 @@ def test_pickle_single_doc():
 def test_list_of_docs_pickles_efficiently():
     nlp = Language()
     for i in range(10000):
-        _ = nlp.vocab[unicode_(i)]
+        _ = nlp.vocab[unicode_(i)]  # noqa: F841
     one_pickled = pickle.dumps(nlp("0"), -1)
     docs = list(nlp.pipe(unicode_(i) for i in range(100)))
     many_pickled = pickle.dumps(docs, -1)
@@ -33,7 +33,7 @@ def test_user_data_from_disk():
     doc.user_data[(0, 1)] = False
     b = doc.to_bytes()
     doc2 = doc.__class__(doc.vocab).from_bytes(b)
-    assert doc2.user_data[(0, 1)] == False
+    assert doc2.user_data[(0, 1)] is False


 def test_user_data_unpickles():
@@ -42,7 +42,7 @@ def test_user_data_unpickles():
     doc.user_data[(0, 1)] = False
     b = pickle.dumps(doc)
     doc2 = pickle.loads(b)
-    assert doc2.user_data[(0, 1)] == False
+    assert doc2.user_data[(0, 1)] is False


 def test_hooks_unpickle():

View File

@@ -87,7 +87,7 @@ def test_span_np_merges(en_tokenizer):
     ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
     for start, end, label, lemma in ents:
         merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
-        assert merged != None, (start, end, label, lemma)
+        assert merged is not None, (start, end, label, lemma)

     text = "One test with entities like New York City so the ents list is not void"
     heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
@@ -95,7 +95,7 @@ def test_span_np_merges(en_tokenizer):
     doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
     for span in doc.ents:
         merged = doc.merge()
-        assert merged != None, (span.start, span.end, span.label_, span.lemma_)
+        assert merged is not None, (span.start, span.end, span.label_, span.lemma_)


 def test_spans_entity_merge(en_tokenizer):

View File

@@ -22,9 +22,9 @@ def test_doc_underscore_getattr_setattr():
     doc.user_data = {}
     Underscore.doc_extensions["hello"] = (False, None, None, None)
     doc._ = Underscore(Underscore.doc_extensions, doc)
-    assert doc._.hello == False
+    assert doc._.hello is False
     doc._.hello = True
-    assert doc._.hello == True
+    assert doc._.hello is True


 def test_create_span_underscore():

View File

@@ -9,5 +9,5 @@ def test_ar_tokenizer_handles_long_text(ar_tokenizer):
     و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها."""
     tokens = ar_tokenizer(text)
-    assert tokens[3].is_stop == True
+    assert tokens[3].is_stop is True
     assert len(tokens) == 77

View File

@@ -9,13 +9,12 @@ from spacy.lang.en.syntax_iterators import SYNTAX_ITERATORS
 from ...util import get_doc


-def test_en_noun_chunks_not_nested(en_tokenizer):
-    text = "Peter has chronic command and control issues"
+def test_en_noun_chunks_not_nested(en_vocab):
+    words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
     heads = [1, 0, 4, 3, -1, -2, -5]
     deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
-    tokens.from_array(
+    doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
+    doc.from_array(
         [HEAD, DEP],
         numpy.asarray(
             [
@@ -30,11 +29,11 @@ def test_en_noun_chunks_not_nested(en_tokenizer):
             dtype="uint64",
         ),
     )
-    tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
+    doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
     word_occurred = {}
-    for chunk in tokens.noun_chunks:
+    for chunk in doc.noun_chunks:
         for word in chunk:
             word_occurred.setdefault(word.text, 0)
             word_occurred[word.text] += 1
     for word, freq in word_occurred.items():
-        assert freq == 1, (word, [chunk.text for chunk in tokens.noun_chunks])
+        assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])

View File

@@ -5,279 +5,309 @@ import pytest
DEFAULT_TESTS = [ DEFAULT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), ("N. kormányzósági\nszékhely.", ["N.", "kormányzósági", "székhely", "."]),
pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()), pytest.param(
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), "A .hu egy tld.", ["A", ".hu", "egy", "tld", "."], marks=pytest.mark.xfail()
('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), ),
('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), ("Az egy.ketto pelda.", ["Az", "egy.ketto", "pelda", "."]),
pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()), ("A pl. rovidites.", ["A", "pl.", "rovidites", "."]),
('Az egy.ketto.', ['Az', 'egy.ketto', '.']), ("A S.M.A.R.T. szo.", ["A", "S.M.A.R.T.", "szo", "."]),
('A pl.', ['A', 'pl.']), pytest.param("A .hu.", ["A", ".hu", "."], marks=pytest.mark.xfail()),
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), ("Az egy.ketto.", ["Az", "egy.ketto", "."]),
('Egy..ket.', ['Egy', '..', 'ket', '.']), ("A pl.", ["A", "pl."]),
('Valami... van.', ['Valami', '...', 'van', '.']), ("A S.M.A.R.T.", ["A", "S.M.A.R.T."]),
('Valami ...van...', ['Valami', '...', 'van', '...']), ("Egy..ket.", ["Egy", "..", "ket", "."]),
('Valami...', ['Valami', '...']), ("Valami... van.", ["Valami", "...", "van", "."]),
('Valami ...', ['Valami', '...']), ("Valami ...van...", ["Valami", "...", "van", "..."]),
('Valami ... más.', ['Valami', '...', 'más', '.']), ("Valami...", ["Valami", "..."]),
('Soha nem lesz!', ['Soha', 'nem', 'lesz', '!']), ("Valami ...", ["Valami", "..."]),
('Soha nem lesz?', ['Soha', 'nem', 'lesz', '?']) ("Valami ... más.", ["Valami", "...", "más", "."]),
("Soha nem lesz!", ["Soha", "nem", "lesz", "!"]),
("Soha nem lesz?", ["Soha", "nem", "lesz", "?"]),
] ]
HYPHEN_TESTS = [ HYPHEN_TESTS = [
('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), (
('Szabolcs-Szatmár-Bereg megye', ['Szabolcs-Szatmár-Bereg', 'megye']), "Egy -nak, -jaiért, -magyar, bel- van.",
('Egy -nak.', ['Egy', '-nak', '.']), ["Egy", "-nak", ",", "-jaiért", ",", "-magyar", ",", "bel-", "van", "."],
('Egy bel-.', ['Egy', 'bel-', '.']), ),
('Dinnye-domb-.', ['Dinnye-domb-', '.']), ("Szabolcs-Szatmár-Bereg megye", ["Szabolcs-Szatmár-Bereg", "megye"]),
('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']), ("Egy -nak.", ["Egy", "-nak", "."]),
('Lakik-e', ['Lakik', '-e']), ("Egy bel-.", ["Egy", "bel-", "."]),
('A--B', ['A', '--', 'B']), ("Dinnye-domb-.", ["Dinnye-domb-", "."]),
('Lakik-e?', ['Lakik', '-e', '?']), ("Ezen -e elcsatangolt.", ["Ezen", "-e", "elcsatangolt", "."]),
('Lakik-e.', ['Lakik', '-e', '.']), ("Lakik-e", ["Lakik", "-e"]),
('Lakik-e...', ['Lakik', '-e', '...']), ("A--B", ["A", "--", "B"]),
('Lakik-e... van.', ['Lakik', '-e', '...', 'van', '.']), ("Lakik-e?", ["Lakik", "-e", "?"]),
('Lakik-e van?', ['Lakik', '-e', 'van', '?']), ("Lakik-e.", ["Lakik", "-e", "."]),
('Lakik-elem van?', ['Lakik-elem', 'van', '?']), ("Lakik-e...", ["Lakik", "-e", "..."]),
('Az életbiztosításáról- egy.', ['Az', 'életbiztosításáról-', 'egy', '.']), ("Lakik-e... van.", ["Lakik", "-e", "...", "van", "."]),
('Van lakik-elem.', ['Van', 'lakik-elem', '.']), ("Lakik-e van?", ["Lakik", "-e", "van", "?"]),
('A 7-es busz?', ['A', '7-es', 'busz', '?']), ("Lakik-elem van?", ["Lakik-elem", "van", "?"]),
('A 7-es?', ['A', '7-es', '?']), ("Az életbiztosításáról- egy.", ["Az", "életbiztosításáról-", "egy", "."]),
('A 7-es.', ['A', '7-es', '.']), ("Van lakik-elem.", ["Van", "lakik-elem", "."]),
('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), ("A 7-es busz?", ["A", "7-es", "busz", "?"]),
('A %-sal.', ['A', '%-sal', '.']), ("A 7-es?", ["A", "7-es", "?"]),
('A $-sal.', ['A', '$-sal', '.']), ("A 7-es.", ["A", "7-es", "."]),
('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.']) ("Ez (lakik)-e?", ["Ez", "(", "lakik", ")", "-e", "?"]),
("A %-sal.", ["A", "%-sal", "."]),
("A $-sal.", ["A", "$-sal", "."]),
("A CD-ROM-okrol.", ["A", "CD-ROM-okrol", "."]),
] ]
NUMBER_TESTS = [ NUMBER_TESTS = [
('A 2b van.', ['A', '2b', 'van', '.']), ("A 2b van.", ["A", "2b", "van", "."]),
('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), ("A 2b-ben van.", ["A", "2b-ben", "van", "."]),
('A 2b.', ['A', '2b', '.']), ("A 2b.", ["A", "2b", "."]),
('A 2b-ben.', ['A', '2b-ben', '.']), ("A 2b-ben.", ["A", "2b-ben", "."]),
('A 3.b van.', ['A', '3.b', 'van', '.']), ("A 3.b van.", ["A", "3.b", "van", "."]),
('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), ("A 3.b-ben van.", ["A", "3.b-ben", "van", "."]),
('A 3.b.', ['A', '3.b', '.']), ("A 3.b.", ["A", "3.b", "."]),
('A 3.b-ben.', ['A', '3.b-ben', '.']), ("A 3.b-ben.", ["A", "3.b-ben", "."]),
('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), ("A 1:20:36.7 van.", ["A", "1:20:36.7", "van", "."]),
('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), ("A 1:20:36.7-ben van.", ["A", "1:20:36.7-ben", "van", "."]),
('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), ("A 1:20:36.7-ben.", ["A", "1:20:36.7-ben", "."]),
('A 1:35 van.', ['A', '1:35', 'van', '.']), ("A 1:35 van.", ["A", "1:35", "van", "."]),
('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), ("A 1:35-ben van.", ["A", "1:35-ben", "van", "."]),
('A 1:35-ben.', ['A', '1:35-ben', '.']), ("A 1:35-ben.", ["A", "1:35-ben", "."]),
('A 1.35 van.', ['A', '1.35', 'van', '.']), ("A 1.35 van.", ["A", "1.35", "van", "."]),
('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), ("A 1.35-ben van.", ["A", "1.35-ben", "van", "."]),
('A 1.35-ben.', ['A', '1.35-ben', '.']), ("A 1.35-ben.", ["A", "1.35-ben", "."]),
('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), ("A 4:01,95 van.", ["A", "4:01,95", "van", "."]),
('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), ("A 4:01,95-ben van.", ["A", "4:01,95-ben", "van", "."]),
('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), ("A 4:01,95-ben.", ["A", "4:01,95-ben", "."]),
('A 10--12 van.', ['A', '10--12', 'van', '.']), ("A 10--12 van.", ["A", "10--12", "van", "."]),
('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), ("A 10--12-ben van.", ["A", "10--12-ben", "van", "."]),
('A 10--12-ben.', ['A', '10--12-ben', '.']), ("A 10--12-ben.", ["A", "10--12-ben", "."]),
('A 1012 van.', ['A', '1012', 'van', '.']), ("A 1012 van.", ["A", "1012", "van", "."]),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ("A 1012-ben van.", ["A", "1012-ben", "van", "."]),
('A 1012-ben.', ['A', '1012-ben', '.']), ("A 1012-ben.", ["A", "1012-ben", "."]),
('A 1012 van.', ['A', '1012', 'van', '.']), ("A 1012 van.", ["A", "1012", "van", "."]),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ("A 1012-ben van.", ["A", "1012-ben", "van", "."]),
('A 1012-ben.', ['A', '1012-ben', '.']), ("A 1012-ben.", ["A", "1012-ben", "."]),
('A 1012 van.', ['A', '1012', 'van', '.']), ("A 1012 van.", ["A", "1012", "van", "."]),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ("A 1012-ben van.", ["A", "1012-ben", "van", "."]),
('A 1012-ben.', ['A', '1012-ben', '.']), ("A 1012-ben.", ["A", "1012-ben", "."]),
('A 1012 van.', ['A', '1012', 'van', '.']), ("A 1012 van.", ["A", "1012", "van", "."]),
('A 1012-ben van.', ['A', '1012-ben', 'van', '.']), ("A 1012-ben van.", ["A", "1012-ben", "van", "."]),
('A 1012-ben.', ['A', '1012-ben', '.']), ("A 1012-ben.", ["A", "1012-ben", "."]),
('A 10—12 van.', ['A', '10—12', 'van', '.']), ("A 10—12 van.", ["A", "10—12", "van", "."]),
('A 10—12-ben van.', ['A', '10—12-ben', 'van', '.']), ("A 10—12-ben van.", ["A", "10—12-ben", "van", "."]),
('A 10—12-ben.', ['A', '10—12-ben', '.']), ("A 10—12-ben.", ["A", "10—12-ben", "."]),
('A 10―12 van.', ['A', '10―12', 'van', '.']), ("A 10―12 van.", ["A", "10―12", "van", "."]),
('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), ("A 10―12-ben van.", ["A", "10―12-ben", "van", "."]),
('A 10―12-ben.', ['A', '10―12-ben', '.']), ("A 10―12-ben.", ["A", "10―12-ben", "."]),
('A -23,12 van.', ['A', '-23,12', 'van', '.']), ("A -23,12 van.", ["A", "-23,12", "van", "."]),
('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), ("A -23,12-ben van.", ["A", "-23,12-ben", "van", "."]),
('A -23,12-ben.', ['A', '-23,12-ben', '.']), ("A -23,12-ben.", ["A", "-23,12-ben", "."]),
('A 2+3 van.', ['A', '2+3', 'van', '.']), ("A 2+3 van.", ["A", "2+3", "van", "."]),
('A 2<3 van.', ['A', '2<3', 'van', '.']), ("A 2<3 van.", ["A", "2<3", "van", "."]),
('A 2=3 van.', ['A', '2=3', 'van', '.']), ("A 2=3 van.", ["A", "2=3", "van", "."]),
('A 2÷3 van.', ['A', '2÷3', 'van', '.']), ("A 2÷3 van.", ["A", "2÷3", "van", "."]),
('A 1=(2÷3)-2/5 van.', ['A', '1=(2÷3)-2/5', 'van', '.']), ("A 1=(2÷3)-2/5 van.", ["A", "1=(2÷3)-2/5", "van", "."]),
('A 2 +3 van.', ['A', '2', '+3', 'van', '.']), ("A 2 +3 van.", ["A", "2", "+3", "van", "."]),
('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), ("A 2+ 3 van.", ["A", "2", "+", "3", "van", "."]),
('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), ("A 2 + 3 van.", ["A", "2", "+", "3", "van", "."]),
('A 2*3 van.', ['A', '2*3', 'van', '.']), ("A 2*3 van.", ["A", "2*3", "van", "."]),
('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), ("A 2 *3 van.", ["A", "2", "*", "3", "van", "."]),
('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), ("A 2* 3 van.", ["A", "2", "*", "3", "van", "."]),
('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), ("A 2 * 3 van.", ["A", "2", "*", "3", "van", "."]),
('A C++ van.', ['A', 'C++', 'van', '.']), ("A C++ van.", ["A", "C++", "van", "."]),
('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), ("A C++-ben van.", ["A", "C++-ben", "van", "."]),
('A C++.', ['A', 'C++', '.']), ("A C++.", ["A", "C++", "."]),
('A C++-ben.', ['A', 'C++-ben', '.']), ("A C++-ben.", ["A", "C++-ben", "."]),
('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), ("A 2003. I. 06. van.", ["A", "2003.", "I.", "06.", "van", "."]),
('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), ("A 2003. I. 06-ben van.", ["A", "2003.", "I.", "06-ben", "van", "."]),
('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), ("A 2003. I. 06.", ["A", "2003.", "I.", "06."]),
('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), ("A 2003. I. 06-ben.", ["A", "2003.", "I.", "06-ben", "."]),
('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), ("A 2003. 01. 06. van.", ["A", "2003.", "01.", "06.", "van", "."]),
('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), ("A 2003. 01. 06-ben van.", ["A", "2003.", "01.", "06-ben", "van", "."]),
('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), ("A 2003. 01. 06.", ["A", "2003.", "01.", "06."]),
('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), ("A 2003. 01. 06-ben.", ["A", "2003.", "01.", "06-ben", "."]),
('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), ("A IV. 12. van.", ["A", "IV.", "12.", "van", "."]),
('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), ("A IV. 12-ben van.", ["A", "IV.", "12-ben", "van", "."]),
('A IV. 12.', ['A', 'IV.', '12.']), ("A IV. 12.", ["A", "IV.", "12."]),
('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), ("A IV. 12-ben.", ["A", "IV.", "12-ben", "."]),
('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), ("A 2003.01.06. van.", ["A", "2003.01.06.", "van", "."]),
('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), ("A 2003.01.06-ben van.", ["A", "2003.01.06-ben", "van", "."]),
('A 2003.01.06.', ['A', '2003.01.06.']), ("A 2003.01.06.", ["A", "2003.01.06."]),
('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), ("A 2003.01.06-ben.", ["A", "2003.01.06-ben", "."]),
('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), ("A IV.12. van.", ["A", "IV.12.", "van", "."]),
('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), ("A IV.12-ben van.", ["A", "IV.12-ben", "van", "."]),
('A IV.12.', ['A', 'IV.12.']), ("A IV.12.", ["A", "IV.12."]),
('A IV.12-ben.', ['A', 'IV.12-ben', '.']), ("A IV.12-ben.", ["A", "IV.12-ben", "."]),
('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), ("A 1.1.2. van.", ["A", "1.1.2.", "van", "."]),
('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), ("A 1.1.2-ben van.", ["A", "1.1.2-ben", "van", "."]),
('A 1.1.2.', ['A', '1.1.2.']), ("A 1.1.2.", ["A", "1.1.2."]),
('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), ("A 1.1.2-ben.", ["A", "1.1.2-ben", "."]),
('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), ("A 1,5--2,5 van.", ["A", "1,5--2,5", "van", "."]),
('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), ("A 1,5--2,5-ben van.", ["A", "1,5--2,5-ben", "van", "."]),
('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), ("A 1,5--2,5-ben.", ["A", "1,5--2,5-ben", "."]),
('A 3,14 van.', ['A', '3,14', 'van', '.']), ("A 3,14 van.", ["A", "3,14", "van", "."]),
('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), ("A 3,14-ben van.", ["A", "3,14-ben", "van", "."]),
('A 3,14-ben.', ['A', '3,14-ben', '.']), ("A 3,14-ben.", ["A", "3,14-ben", "."]),
('A 3.14 van.', ['A', '3.14', 'van', '.']), ("A 3.14 van.", ["A", "3.14", "van", "."]),
('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), ("A 3.14-ben van.", ["A", "3.14-ben", "van", "."]),
('A 3.14-ben.', ['A', '3.14-ben', '.']), ("A 3.14-ben.", ["A", "3.14-ben", "."]),
('A 15. van.', ['A', '15.', 'van', '.']), ("A 15. van.", ["A", "15.", "van", "."]),
('A 15-ben van.', ['A', '15-ben', 'van', '.']), ("A 15-ben van.", ["A", "15-ben", "van", "."]),
('A 15-ben.', ['A', '15-ben', '.']), ("A 15-ben.", ["A", "15-ben", "."]),
('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), ("A 15.-ben van.", ["A", "15.-ben", "van", "."]),
('A 15.-ben.', ['A', '15.-ben', '.']), ("A 15.-ben.", ["A", "15.-ben", "."]),
('A 2002--2003. van.', ['A', '2002--2003.', 'van', '.']), ("A 2002--2003. van.", ["A", "2002--2003.", "van", "."]),
('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), ("A 2002--2003-ben van.", ["A", "2002--2003-ben", "van", "."]),
('A 2002-2003-ben.', ['A', '2002-2003-ben', '.']), ("A 2002-2003-ben.", ["A", "2002-2003-ben", "."]),
('A +0,99% van.', ['A', '+0,99%', 'van', '.']), ("A +0,99% van.", ["A", "+0,99%", "van", "."]),
('A -0,99% van.', ['A', '-0,99%', 'van', '.']), ("A -0,99% van.", ["A", "-0,99%", "van", "."]),
('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), ("A -0,99%-ben van.", ["A", "-0,99%-ben", "van", "."]),
('A -0,99%.', ['A', '-0,99%', '.']), ("A -0,99%.", ["A", "-0,99%", "."]),
('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), ("A -0,99%-ben.", ["A", "-0,99%-ben", "."]),
('A 10--20% van.', ['A', '10--20%', 'van', '.']), ("A 10--20% van.", ["A", "10--20%", "van", "."]),
('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), ("A 10--20%-ben van.", ["A", "10--20%-ben", "van", "."]),
('A 10--20%.', ['A', '10--20%', '.']), ("A 10--20%.", ["A", "10--20%", "."]),
('A 10--20%-ben.', ['A', '10--20%-ben', '.']), ("A 10--20%-ben.", ["A", "10--20%-ben", "."]),
('A 99§ van.', ['A', '99§', 'van', '.']), ("A 99§ van.", ["A", "99§", "van", "."]),
('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), ("A 99§-ben van.", ["A", "99§-ben", "van", "."]),
('A 99§-ben.', ['A', '99§-ben', '.']), ("A 99§-ben.", ["A", "99§-ben", "."]),
('A 10--20§ van.', ['A', '10--20§', 'van', '.']), ("A 10--20§ van.", ["A", "10--20§", "van", "."]),
('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), ("A 10--20§-ben van.", ["A", "10--20§-ben", "van", "."]),
('A 10--20§-ben.', ['A', '10--20§-ben', '.']), ("A 10--20§-ben.", ["A", "10--20§-ben", "."]),
('A 99° van.', ['A', '99°', 'van', '.']), ("A 99° van.", ["A", "99°", "van", "."]),
('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), ("A 99°-ben van.", ["A", "99°-ben", "van", "."]),
('A 99°-ben.', ['A', '99°-ben', '.']), ("A 99°-ben.", ["A", "99°-ben", "."]),
('A 10--20° van.', ['A', '10--20°', 'van', '.']), ("A 10--20° van.", ["A", "10--20°", "van", "."]),
('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), ("A 10--20°-ben van.", ["A", "10--20°-ben", "van", "."]),
('A 10--20°-ben.', ['A', '10--20°-ben', '.']), ("A 10--20°-ben.", ["A", "10--20°-ben", "."]),
('A °C van.', ['A', '°C', 'van', '.']), ("A °C van.", ["A", "°C", "van", "."]),
('A °C-ben van.', ['A', '°C-ben', 'van', '.']), ("A °C-ben van.", ["A", "°C-ben", "van", "."]),
('A °C.', ['A', '°C', '.']), ("A °C.", ["A", "°C", "."]),
('A °C-ben.', ['A', '°C-ben', '.']), ("A °C-ben.", ["A", "°C-ben", "."]),
('A 100°C van.', ['A', '100°C', 'van', '.']), ("A 100°C van.", ["A", "100°C", "van", "."]),
('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), ("A 100°C-ben van.", ["A", "100°C-ben", "van", "."]),
('A 100°C.', ['A', '100°C', '.']), ("A 100°C.", ["A", "100°C", "."]),
('A 100°C-ben.', ['A', '100°C-ben', '.']), ("A 100°C-ben.", ["A", "100°C-ben", "."]),
('A 800x600 van.', ['A', '800x600', 'van', '.']), ("A 800x600 van.", ["A", "800x600", "van", "."]),
('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), ("A 800x600-ben van.", ["A", "800x600-ben", "van", "."]),
('A 800x600-ben.', ['A', '800x600-ben', '.']), ("A 800x600-ben.", ["A", "800x600-ben", "."]),
('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), ("A 1x2x3x4 van.", ["A", "1x2x3x4", "van", "."]),
('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), ("A 1x2x3x4-ben van.", ["A", "1x2x3x4-ben", "van", "."]),
('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), ("A 1x2x3x4-ben.", ["A", "1x2x3x4-ben", "."]),
('A 5/J van.', ['A', '5/J', 'van', '.']), ("A 5/J van.", ["A", "5/J", "van", "."]),
('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), ("A 5/J-ben van.", ["A", "5/J-ben", "van", "."]),
('A 5/J-ben.', ['A', '5/J-ben', '.']), ("A 5/J-ben.", ["A", "5/J-ben", "."]),
('A 5/J. van.', ['A', '5/J.', 'van', '.']), ("A 5/J. van.", ["A", "5/J.", "van", "."]),
('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), ("A 5/J.-ben van.", ["A", "5/J.-ben", "van", "."]),
('A 5/J.-ben.', ['A', '5/J.-ben', '.']), ("A 5/J.-ben.", ["A", "5/J.-ben", "."]),
('A III/1 van.', ['A', 'III/1', 'van', '.']), ("A III/1 van.", ["A", "III/1", "van", "."]),
('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), ("A III/1-ben van.", ["A", "III/1-ben", "van", "."]),
('A III/1-ben.', ['A', 'III/1-ben', '.']), ("A III/1-ben.", ["A", "III/1-ben", "."]),
('A III/1. van.', ['A', 'III/1.', 'van', '.']), ("A III/1. van.", ["A", "III/1.", "van", "."]),
('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), ("A III/1.-ben van.", ["A", "III/1.-ben", "van", "."]),
('A III/1.-ben.', ['A', 'III/1.-ben', '.']), ("A III/1.-ben.", ["A", "III/1.-ben", "."]),
('A III/c van.', ['A', 'III/c', 'van', '.']), ("A III/c van.", ["A", "III/c", "van", "."]),
('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), ("A III/c-ben van.", ["A", "III/c-ben", "van", "."]),
('A III/c.', ['A', 'III/c', '.']), ("A III/c.", ["A", "III/c", "."]),
('A III/c-ben.', ['A', 'III/c-ben', '.']), ("A III/c-ben.", ["A", "III/c-ben", "."]),
('A TU154 van.', ['A', 'TU154', 'van', '.']), ("A TU154 van.", ["A", "TU154", "van", "."]),
('A TU154-ben van.', ['A', 'TU154-ben', 'van', '.']), ("A TU154-ben van.", ["A", "TU154-ben", "van", "."]),
('A TU154-ben.', ['A', 'TU154-ben', '.']), ("A TU154-ben.", ["A", "TU154-ben", "."]),
('A 5cm³', ['A', '5', 'cm³']), ("A 5cm³", ["A", "5", "cm³"]),
('A 5 $-ban', ['A', '5', '$-ban']), ("A 5 $-ban", ["A", "5", "$-ban"]),
('A 5$-ban', ['A', '5$-ban']), ("A 5$-ban", ["A", "5$-ban"]),
('A 5$.', ['A', '5', '$', '.']), ("A 5$.", ["A", "5", "$", "."]),
('A 5$', ['A', '5', '$']), ("A 5$", ["A", "5", "$"]),
('A $5', ['A', '$5']), ("A $5", ["A", "$5"]),
('A 5km/h', ['A', '5', 'km/h']), ("A 5km/h", ["A", "5", "km/h"]),
('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']), ("A 75%+1-100%-ig", ["A", "75%+1-100%-ig"]),
('A 5km/h.', ['A', '5', 'km/h', '.']), ("A 5km/h.", ["A", "5", "km/h", "."]),
('3434/1992. évi elszámolás', ['3434/1992.', 'évi', 'elszámolás']), ("3434/1992. évi elszámolás", ["3434/1992.", "évi", "elszámolás"]),
] ]
QUOTE_TESTS = [ QUOTE_TESTS = [
('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), (
('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), 'Az "Ime, hat"-ban irja.',
('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), ["Az", '"', "Ime", ",", "hat", '"', "-ban", "irja", "."],
('Egy 24"-os monitor.', ['Egy', '24"-os', 'monitor', '.']), ),
("A McDonald's van.", ['A', "McDonald's", 'van', '.']) ('"Ime, hat"-ban irja.', ['"', "Ime", ",", "hat", '"', "-ban", "irja", "."]),
('Az "Ime, hat".', ["Az", '"', "Ime", ",", "hat", '"', "."]),
('Egy 24"-os monitor.', ["Egy", '24"-os', "monitor", "."]),
("A McDonald's van.", ["A", "McDonald's", "van", "."]),
] ]
DOT_TESTS = [ DOT_TESTS = [
('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), ("N. kormányzósági\nszékhely.", ["N.", "kormányzósági", "székhely", "."]),
pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()), pytest.param(
('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), "A .hu egy tld.", ["A", ".hu", "egy", "tld", "."], marks=pytest.mark.xfail()
('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']), ),
('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']), ("Az egy.ketto pelda.", ["Az", "egy.ketto", "pelda", "."]),
pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()), ("A pl. rövidítés.", ["A", "pl.", "rövidítés", "."]),
('Az egy.ketto.', ['Az', 'egy.ketto', '.']), ("A S.M.A.R.T. szó.", ["A", "S.M.A.R.T.", "szó", "."]),
('A pl.', ['A', 'pl.']), pytest.param("A .hu.", ["A", ".hu", "."], marks=pytest.mark.xfail()),
('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), ("Az egy.ketto.", ["Az", "egy.ketto", "."]),
('Egy..ket.', ['Egy', '..', 'ket', '.']), ("A pl.", ["A", "pl."]),
('Valami... van.', ['Valami', '...', 'van', '.']), ("A S.M.A.R.T.", ["A", "S.M.A.R.T."]),
('Valami ...van...', ['Valami', '...', 'van', '...']), ("Egy..ket.", ["Egy", "..", "ket", "."]),
('Valami...', ['Valami', '...']), ("Valami... van.", ["Valami", "...", "van", "."]),
('Valami ...', ['Valami', '...']), ("Valami ...van...", ["Valami", "...", "van", "..."]),
('Valami ... más.', ['Valami', '...', 'más', '.']) ("Valami...", ["Valami", "..."]),
("Valami ...", ["Valami", "..."]),
("Valami ... más.", ["Valami", "...", "más", "."]),
] ]
TYPO_TESTS = [ TYPO_TESTS = [
( (
'Ez egy mondat vége.Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']), "Ez egy mondat vége.Ez egy másik eleje.",
('Ez egy mondat vége .Ez egy másik eleje.', ["Ez", "egy", "mondat", "vége", ".", "Ez", "egy", "másik", "eleje", "."],
['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']), ),
( (
'Ez egy mondat vége!ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']), "Ez egy mondat vége .Ez egy másik eleje.",
('Ez egy mondat vége !ez egy másik eleje.', ["Ez", "egy", "mondat", "vége", ".", "Ez", "egy", "másik", "eleje", "."],
['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']), ),
( (
'Ez egy mondat vége?Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']), "Ez egy mondat vége!ez egy másik eleje.",
('Ez egy mondat vége ?Ez egy másik eleje.', ["Ez", "egy", "mondat", "vége", "!", "ez", "egy", "másik", "eleje", "."],
['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']), ),
('egy,kettő', ['egy', ',', 'kettő']), (
('egy ,kettő', ['egy', ',', 'kettő']), "Ez egy mondat vége !ez egy másik eleje.",
('egy :kettő', ['egy', ':', 'kettő']), ["Ez", "egy", "mondat", "vége", "!", "ez", "egy", "másik", "eleje", "."],
),
(
"Ez egy mondat vége?Ez egy másik eleje.",
["Ez", "egy", "mondat", "vége", "?", "Ez", "egy", "másik", "eleje", "."],
),
(
"Ez egy mondat vége ?Ez egy másik eleje.",
["Ez", "egy", "mondat", "vége", "?", "Ez", "egy", "másik", "eleje", "."],
),
("egy,kettő", ["egy", ",", "kettő"]),
("egy ,kettő", ["egy", ",", "kettő"]),
("egy :kettő", ["egy", ":", "kettő"]),
] ]
WIKI_TESTS = [ WIKI_TESTS = [
('!"', ['!', '"']), ('!"', ["!", '"']),
('lány"a', ['lány', '"', 'a']), ('lány"a', ["lány", '"', "a"]),
('lány"a', ['lány', '"', 'a']), ('lány"a', ["lány", '"', "a"]),
('!"-lel', ['!', '"', '-lel']), ('!"-lel', ["!", '"', "-lel"]),
('""-sorozat ', ['"', '"', '-sorozat']), ('""-sorozat ', ['"', '"', "-sorozat"]),
('"(Köszönöm', ['"', '(', 'Köszönöm']), ('"(Köszönöm', ['"', "(", "Köszönöm"]),
('(törvénykönyv)-ben ', ['(', 'törvénykönyv', ')', '-ben']), ("(törvénykönyv)-ben ", ["(", "törvénykönyv", ")", "-ben"]),
('"(...)"sokkal ', ['"', '(', '...', ')', '"', 'sokkal']), ('"(...)"sokkal ', ['"', "(", "...", ")", '"', "sokkal"]),
('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid']) ("cérium(IV)-oxid", ["cérium", "(", "IV", ")", "-oxid"]),
] ]
TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS TESTCASES = (
DEFAULT_TESTS
+ DOT_TESTS
+ QUOTE_TESTS
+ NUMBER_TESTS
+ HYPHEN_TESTS
+ WIKI_TESTS
+ TYPO_TESTS
)
@pytest.mark.parametrize('text,expected_tokens', TESTCASES) @pytest.mark.parametrize("text,expected_tokens", TESTCASES)
def test_hu_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens): def test_hu_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens):
tokens = hu_tokenizer(text) tokens = hu_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space] token_list = [token.text for token in tokens if not token.is_space]

View File

@@ -9,7 +9,7 @@ from ...util import get_doc
 @pytest.fixture
 def ru_lemmatizer():
-    pymorphy = pytest.importorskip("pymorphy2")
+    pytest.importorskip("pymorphy2")
     return Russian.Defaults.create_lemmatizer()

View File

@@ -51,7 +51,7 @@ def test_matcher_from_usage_docs(en_vocab):
     matcher = Matcher(en_vocab)
     matcher.add("HAPPY", label_sentiment, *pos_patterns)
-    matches = matcher(doc)
+    matcher(doc)
     assert doc.sentiment != 0
     assert doc[1].norm_ == "happy emoji"

View File

@@ -145,4 +145,4 @@ def test_get_oracle_actions():
     heads, deps = projectivize(heads, deps)
     gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
     parser.moves.preprocess_gold(gold)
-    actions = parser.moves.get_oracle_sequence(doc, gold)
+    parser.moves.get_oracle_sequence(doc, gold)

View File

@@ -57,6 +57,7 @@ def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots):
     tsys.preprocess_gold(gold)
     act_classes = tsys.get_oracle_sequence(doc, gold)
     names = [tsys.get_class_name(act) for act in act_classes]
+    assert names


 def test_get_oracle_moves_negative_entities2(tsys, vocab):
@@ -66,6 +67,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab):
     tsys.preprocess_gold(gold)
     act_classes = tsys.get_oracle_sequence(doc, gold)
     names = [tsys.get_class_name(act) for act in act_classes]
+    assert names


 def test_get_oracle_moves_negative_O(tsys, vocab):
@@ -75,6 +77,7 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
     tsys.preprocess_gold(gold)
     act_classes = tsys.get_oracle_sequence(doc, gold)
     names = [tsys.get_class_name(act) for act in act_classes]
+    assert names


 def test_doc_add_entities_set_ents_iob(en_vocab):

View File

@@ -47,34 +47,34 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree):
 def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree):
-    assert contains_cycle(tree) == None
+    assert contains_cycle(tree) is None
     assert contains_cycle(cyclic_tree) == set([3, 4, 5])
-    assert contains_cycle(partial_tree) == None
-    assert contains_cycle(multirooted_tree) == None
+    assert contains_cycle(partial_tree) is None
+    assert contains_cycle(multirooted_tree) is None


 def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree):
-    assert is_nonproj_arc(0, nonproj_tree) == False
-    assert is_nonproj_arc(1, nonproj_tree) == False
-    assert is_nonproj_arc(2, nonproj_tree) == False
-    assert is_nonproj_arc(3, nonproj_tree) == False
-    assert is_nonproj_arc(4, nonproj_tree) == False
-    assert is_nonproj_arc(5, nonproj_tree) == False
-    assert is_nonproj_arc(6, nonproj_tree) == False
-    assert is_nonproj_arc(7, nonproj_tree) == True
-    assert is_nonproj_arc(8, nonproj_tree) == False
-    assert is_nonproj_arc(7, partial_tree) == False
-    assert is_nonproj_arc(17, multirooted_tree) == False
-    assert is_nonproj_arc(16, multirooted_tree) == True
+    assert is_nonproj_arc(0, nonproj_tree) is False
+    assert is_nonproj_arc(1, nonproj_tree) is False
+    assert is_nonproj_arc(2, nonproj_tree) is False
+    assert is_nonproj_arc(3, nonproj_tree) is False
+    assert is_nonproj_arc(4, nonproj_tree) is False
+    assert is_nonproj_arc(5, nonproj_tree) is False
+    assert is_nonproj_arc(6, nonproj_tree) is False
+    assert is_nonproj_arc(7, nonproj_tree) is True
+    assert is_nonproj_arc(8, nonproj_tree) is False
+    assert is_nonproj_arc(7, partial_tree) is False
+    assert is_nonproj_arc(17, multirooted_tree) is False
+    assert is_nonproj_arc(16, multirooted_tree) is True


 def test_parser_is_nonproj_tree(
     proj_tree, nonproj_tree, partial_tree, multirooted_tree
 ):
-    assert is_nonproj_tree(proj_tree) == False
-    assert is_nonproj_tree(nonproj_tree) == True
-    assert is_nonproj_tree(partial_tree) == False
-    assert is_nonproj_tree(multirooted_tree) == True
+    assert is_nonproj_tree(proj_tree) is False
+    assert is_nonproj_tree(nonproj_tree) is True
+    assert is_nonproj_tree(partial_tree) is False
+    assert is_nonproj_tree(multirooted_tree) is True


 def test_parser_pseudoprojectivity(en_tokenizer):
@@ -100,8 +100,8 @@ def test_parser_pseudoprojectivity(en_tokenizer):
     assert nonproj.decompose("X||Y") == ("X", "Y")
     assert nonproj.decompose("X") == ("X", "")
-    assert nonproj.is_decorated("X||Y") == True
-    assert nonproj.is_decorated("X") == False
+    assert nonproj.is_decorated("X||Y") is True
+    assert nonproj.is_decorated("X") is False
     nonproj._lift(0, tree)
     assert tree == [2, 2, 2]

View File

@@ -25,7 +25,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
     )
     assert len(doc) == 1
-    with en_parser.step_through(doc) as _:
+    with en_parser.step_through(doc) as _:  # noqa: F841
         pass
     assert doc[0].dep != 0
@@ -33,7 +33,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text):
 @pytest.mark.xfail
 def test_parser_initial(en_tokenizer, en_parser):
     text = "I ate the pizza with anchovies."
-    heads = [1, 0, 1, -2, -3, -1, -5]
+    # heads = [1, 0, 1, -2, -3, -1, -5]
     transition = ["L-nsubj", "S", "L-det"]
     tokens = en_tokenizer(text)
     apply_transition_sequence(en_parser, tokens, transition)

View File

@@ -71,7 +71,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
 def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
     doc = Doc(en_parser.vocab, words=text)
     assert len(doc) == length
-    with en_parser.step_through(doc) as _:
+    with en_parser.step_through(doc) as _:  # noqa: F841
         pass
     assert doc[0].is_space
     for token in doc:

View File

@@ -20,7 +20,6 @@ def test_train_with_many_entity_types():
     optimizer = nlp.begin_training()
     for i in range(20):
         losses = {}
-        index = 0
         random.shuffle(train_data)
         for statement, entities in train_data:
             nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)

View File

@@ -152,6 +152,7 @@ def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)
     doc = Doc(vocab, words=["whata"])
+    assert doc


 def test_issue590(en_vocab):
@@ -216,7 +217,7 @@ def test_issue615(en_tokenizer):
     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
     matcher.add(label, merge_phrases, pattern)
-    match = matcher(doc)
+    matcher(doc)
     entities = list(doc.ents)
     assert entities != []
     assert entities[0].label != 0
@@ -331,8 +332,7 @@ def test_issue850():
     handle the ambiguity correctly."""
     vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
     matcher = Matcher(vocab)
-    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-    pattern = [{"LOWER": "bob"}, {"OP": "*", "IS_ANY_TOKEN": True}, {"LOWER": "frank"}]
+    pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
     matcher.add("FarAway", None, pattern)
     doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
     match = matcher(doc)
@@ -346,7 +346,6 @@ def test_issue850_basic():
     """Test Matcher matches with '*' operator and Boolean flag"""
     vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
     matcher = Matcher(vocab)
-    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
     pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
     matcher.add("FarAway", None, pattern)
     doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
@@ -405,12 +404,13 @@ def test_issue912(en_vocab, text, tag, lemma):
 def test_issue957(en_tokenizer):
     """Test that spaCy doesn't hang on many periods."""
-    # skip test if pytest-timeout is not installed
-    timeout = pytest.importorskip("pytest-timeout")
+    # Skip test if pytest-timeout is not installed
+    pytest.importorskip("pytest-timeout")
     string = "0"
     for i in range(1, 100):
         string += ".%d" % i
     doc = en_tokenizer(string)
+    assert doc


 @pytest.mark.xfail

View File

@@ -138,12 +138,12 @@ def test_issue1757():
     """Test comparison against None doesn't cause segfault."""
     doc = Doc(Vocab(), words=["a", "b", "c"])
     assert not doc[0] < None
-    assert not doc[0] == None
+    assert not doc[0] is None
     assert doc[0] >= None
     assert not doc[:2] < None
-    assert not doc[:2] == None
+    assert not doc[:2] is None
     assert doc[:2] >= None
-    assert not doc.vocab["a"] == None
+    assert not doc.vocab["a"] is None
     assert not doc.vocab["a"] < None

View File

@@ -74,4 +74,4 @@ def test_issue2482():
     nlp = Italian()
     nlp.add_pipe(nlp.create_pipe("ner"))
     b = nlp.to_bytes()
-    nlp2 = Italian().from_bytes(b)
+    Italian().from_bytes(b)

View File

@@ -8,3 +8,4 @@ def test_issue2626(en_tokenizer):
 ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume
 """
     doc = en_tokenizer(text)
+    assert doc

View File

@@ -9,14 +9,6 @@ def test_issue2671():
     """Ensure the correct entity ID is returned for matches with quantifiers.
     See also #2675
     """
-
-    def get_rule_id(nlp, matcher, doc):
-        matches = matcher(doc)
-        for match_id, start, end in matches:
-            rule_id = nlp.vocab.strings[match_id]
-            span = doc[start:end]
-        return rule_id
-
     nlp = English()
     matcher = Matcher(nlp.vocab)
     pattern_id = "test_pattern"
@@ -28,5 +20,9 @@ def test_issue2671():
     matcher.add(pattern_id, None, pattern)
     doc1 = nlp("This is a high-adrenaline situation.")
     doc2 = nlp("This is a high adrenaline situation.")
-    assert get_rule_id(nlp, matcher, doc1) == pattern_id
-    assert get_rule_id(nlp, matcher, doc2) == pattern_id
+    matches1 = matcher(doc1)
+    for match_id, start, end in matches1:
+        assert nlp.vocab.strings[match_id] == pattern_id
+    matches2 = matcher(doc2)
+    for match_id, start, end in matches2:
+        assert nlp.vocab.strings[match_id] == pattern_id

View File

@@ -72,9 +72,8 @@ def test_to_from_bytes(parser, blank_parser):
     reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms."
 )
 def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
-    tagger1, tagger2 = taggers
+    tagger1 = taggers[0]
     tagger1_b = tagger1.to_bytes()
-    tagger2_b = tagger2.to_bytes()
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
@@ -114,4 +113,4 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
 def test_serialize_textcat_empty(en_vocab):
     # See issue #1105
     textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
-    textcat_bytes = textcat.to_bytes()
+    textcat.to_bytes()

View File

@@ -19,7 +19,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
     serialized and deserialized correctly (see #2494)."""
     tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search)
     tokenizer_bytes = tokenizer.to_bytes()
-    new_tokenizer = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
+    Tokenizer(en_vocab).from_bytes(tokenizer_bytes)


 @pytest.mark.skip(reason="Currently unreliable across platforms")

View File

@@ -45,7 +45,7 @@ def test_align_i2j(string1, string2, i2j):
         ("t", "catsie", [-1, -1, 0, -1, -1, -1]),
     ],
 )
-def test_align_i2j(string1, string2, j2i):
+def test_align_i2j_2(string1, string2, j2i):
     output_cost, output_i2j, output_j2i, matrix = align(string1, string2)
     assert list(output_j2i) == j2i

View File

@@ -75,7 +75,7 @@ def test_displacy_spans(en_vocab):
 def test_displacy_raises_for_wrong_type(en_vocab):
     with pytest.raises(ValueError):
-        html = displacy.render("hello world")
+        displacy.render("hello world")


 def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):

View File

@@ -41,21 +41,21 @@ def test_vocab_lexeme_is_digit(en_vocab):
 def test_vocab_lexeme_add_flag_auto_id(en_vocab):
     is_len4 = en_vocab.add_flag(lambda string: len(string) == 4)
-    assert en_vocab["1999"].check_flag(is_len4) == True
-    assert en_vocab["1999"].check_flag(IS_DIGIT) == True
-    assert en_vocab["199"].check_flag(is_len4) == False
-    assert en_vocab["199"].check_flag(IS_DIGIT) == True
-    assert en_vocab["the"].check_flag(is_len4) == False
-    assert en_vocab["dogs"].check_flag(is_len4) == True
+    assert en_vocab["1999"].check_flag(is_len4) is True
+    assert en_vocab["1999"].check_flag(IS_DIGIT) is True
+    assert en_vocab["199"].check_flag(is_len4) is False
+    assert en_vocab["199"].check_flag(IS_DIGIT) is True
+    assert en_vocab["the"].check_flag(is_len4) is False
+    assert en_vocab["dogs"].check_flag(is_len4) is True


 def test_vocab_lexeme_add_flag_provided_id(en_vocab):
     is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT)
-    assert en_vocab["1999"].check_flag(is_len4) == True
-    assert en_vocab["199"].check_flag(is_len4) == False
-    assert en_vocab["199"].check_flag(IS_DIGIT) == False
-    assert en_vocab["the"].check_flag(is_len4) == False
-    assert en_vocab["dogs"].check_flag(is_len4) == True
+    assert en_vocab["1999"].check_flag(is_len4) is True
+    assert en_vocab["199"].check_flag(is_len4) is False
+    assert en_vocab["199"].check_flag(IS_DIGIT) is False
+    assert en_vocab["the"].check_flag(is_len4) is False
+    assert en_vocab["dogs"].check_flag(is_len4) is True


 def test_lexeme_bytes_roundtrip(en_vocab):

View File

@@ -24,7 +24,7 @@ def test_stringstore_from_api_docs(stringstore):
     assert stringstore[apple_hash] == "apple"
     assert "apple" in stringstore
     assert "cherry" not in stringstore
-    orange_hash = stringstore.add("orange")
+    stringstore.add("orange")
     all_strings = [s for s in stringstore]
     assert all_strings == ["apple", "orange"]
     banana_hash = stringstore.add("banana")
@@ -63,7 +63,7 @@ def test_stringstore_retrieve_id(stringstore, text):
 def test_stringstore_med_string(stringstore, text1, text2):
     store = stringstore.add(text1)
     assert stringstore[store] == text1.decode("utf8")
-    dummy = stringstore.add(text2)
+    stringstore.add(text2)
     assert stringstore[text1] == store

View File

@@ -273,9 +273,9 @@ def test_vocab_add_vector():
 def test_vocab_prune_vectors():
     vocab = Vocab()
-    _ = vocab["cat"]
-    _ = vocab["dog"]
-    _ = vocab["kitten"]
+    _ = vocab["cat"]  # noqa: F841
+    _ = vocab["dog"]  # noqa: F841
+    _ = vocab["kitten"]  # noqa: F841
     data = numpy.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0

View File

@@ -42,6 +42,6 @@ def test_vocab_api_symbols(en_vocab, string, symbol):
 @pytest.mark.parametrize("text", "Hello")
 def test_vocab_api_contains(en_vocab, text):
-    _ = en_vocab[text]
+    _ = en_vocab[text]  # noqa: F841
     assert text in en_vocab
     assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab

View File

@@ -46,7 +46,6 @@ class Binder(object):
     def get_docs(self, vocab):
        """Recover Doc objects from the annotations, using the given vocab."""
-        attrs = self.attrs
        for string in self.strings:
            vocab[string]
        orth_col = self.attrs.index(ORTH)