Update model fixtures and reorganise tests

This commit is contained in:
ines 2017-05-29 22:14:31 +02:00
parent 795fe43a4d
commit 20a7003c0d
34 changed files with 374 additions and 408 deletions

View File

@ -1,19 +1,40 @@
# coding: utf-8
from __future__ import unicode_literals
from ..tokens import Doc
from ..strings import StringStore
from ..lemmatizer import Lemmatizer
from ..attrs import ORTH, TAG, HEAD, DEP
from .. import util
from io import StringIO, BytesIO
from pathlib import Path
import pytest
from .util import load_test_model
from ..tokens import Doc
from ..strings import StringStore
from .. import util
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
'nl', 'pl', 'pt', 'sv']
'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm', 'en_core_web_md'],
'de': ['de_core_news_md'],
'fr': ['fr_depvec_web_lg'],
'xx': ['xx_ent_web_md']}
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(params=_models['en'], scope="session")
def EN(request):
    """Session-scoped fixture: calls ``load_test_model`` once for each model
    name listed under ``_models['en']``."""
    model_name = request.param
    return load_test_model(model_name)
@pytest.fixture(params=_models['de'], scope="session")
def DE(request):
    """Session-scoped fixture: calls ``load_test_model`` once for each model
    name listed under ``_models['de']``."""
    model_name = request.param
    return load_test_model(model_name)
@pytest.fixture(params=_models['fr'], scope="session")
def FR(request):
    """Session-scoped fixture: calls ``load_test_model`` once for each model
    name listed under ``_models['fr']``."""
    model_name = request.param
    return load_test_model(model_name)
@pytest.fixture(params=_languages)
@ -91,11 +112,6 @@ def en_entityrecognizer():
return util.get_lang_class('en').Defaults.create_entity()
@pytest.fixture
def lemmatizer():
return util.get_lang_class('en').Defaults.create_lemmatizer()
@pytest.fixture
def text_file():
return StringIO()
@ -105,22 +121,6 @@ def text_file_b():
return BytesIO()
# only used for tests that require loading the models
# in all other cases, use specific instances
@pytest.fixture(scope="session")
def EN():
return English()
@pytest.fixture(scope="session")
def DE():
return German()
@pytest.fixture(scope="session")
def FR():
return French()
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")

View File

@ -0,0 +1,35 @@
# coding: utf-8
from __future__ import unicode_literals
from ...util import get_doc
import pytest
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
    """Noun chunks of a simple German sentence, using hand-set tags, deps
    and heads instead of a trained parser."""
    sentence = "Eine Tasse steht auf dem Tisch."
    pos_tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
    dep_labels = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
    heads = [1, 1, 0, -1, 1, -2, -4]
    tokenized = de_tokenizer(sentence)
    words = [token.text for token in tokenized]
    doc = get_doc(tokenized.vocab, words, tags=pos_tags, deps=dep_labels,
                  heads=heads)
    # One list comparison checks both the number of chunks and their text.
    chunk_texts = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert chunk_texts == ["Eine Tasse ", "dem Tisch "]
def test_de_extended_chunk(de_tokenizer):
    """Noun chunks of a German sentence in which one chunk spans three
    tokens ("einer Tasse Kaffee")."""
    sentence = "Die Sängerin singt mit einer Tasse Kaffee Arien."
    pos_tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
    dep_labels = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
    tokenized = de_tokenizer(sentence)
    words = [token.text for token in tokenized]
    doc = get_doc(tokenized.vocab, words, tags=pos_tags, deps=dep_labels,
                  heads=heads)
    # One list comparison checks both the number of chunks and their text.
    chunk_texts = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert chunk_texts == ["Die Sängerin ", "einer Tasse Kaffee ", "Arien "]

View File

@ -1,87 +0,0 @@
# coding: utf-8
"""Test that tokens are created correctly for contractions."""
from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_basic_contraction(en_tokenizer):
text = "don't giggle"
tokens = en_tokenizer(text)
assert len(tokens) == 3
assert tokens[1].text == "n't"
text = "i said don't!"
tokens = en_tokenizer(text)
assert len(tokens) == 5
assert tokens[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
tokens = en_tokenizer(text_poss)
assert len(tokens) == 2
assert tokens[0].text == text
assert tokens[1].text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == text.split("'")[0]
assert tokens[1].text == "'ll"
assert tokens[1].lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
tokens_lower = en_tokenizer(text_lower)
tokens_title = en_tokenizer(text_title)
assert tokens_title[0].text == tokens_lower[0].text.title()
assert tokens_lower[0].text == tokens_title[0].text.lower()
assert tokens_lower[1].text == tokens_title[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
tokens = en_tokenizer(pron + contraction)
assert tokens[0].text == pron
assert tokens[1].text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
tokens = en_tokenizer(exc)
assert len(tokens) == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
tokens = en_tokenizer(wo_punct)
assert len(tokens) == 2
tokens = en_tokenizer(w_punct)
assert len(tokens) == 3

View File

@ -1,19 +1,96 @@
# coding: utf-8
"""Test that tokenizer exceptions are handled correctly."""
from __future__ import unicode_literals
import pytest
def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
    """Check token counts for "don't", alone and followed by "!"."""
    plain = en_tokenizer("don't giggle")
    assert len(plain) == 3
    assert plain[1].text == "n't"

    with_punct = en_tokenizer("i said don't!")
    assert len(with_punct) == 5
    assert with_punct[4].text == "!"
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """Each contraction with one attached punctuation mark gives three tokens."""
    token_count = len(en_tokenizer(text))
    assert token_count == 3
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    """A possessive is tokenized into the bare name followed by "'s"."""
    tokens = en_tokenizer(text_poss)
    assert len(tokens) == 2
    name_token, suffix_token = tokens[0], tokens[1]
    assert name_token.text == text
    assert suffix_token.text == "'s"
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    """A trailing apostrophe is tokenized as its own second token."""
    # Everything before the first apostrophe is the expected first token.
    expected_stem = text.partition("'")[0]
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    assert tokens[0].text == expected_stem
    assert tokens[1].text == "'"
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    """Apostrophe exceptions such as "'em" must stay a single token.

    The function was previously named ``text_...``, so pytest's default
    ``test_*`` discovery never collected it and the check never ran.
    """
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    """"'ll" is split off as a second token whose lemma is "will"."""
    # Everything before the first apostrophe is the expected first token.
    expected_stem = text.partition("'")[0]
    tokens = en_tokenizer(text)
    assert len(tokens) == 2
    stem_token, contraction = tokens[0], tokens[1]
    assert stem_token.text == expected_stem
    assert contraction.text == "'ll"
    assert contraction.lemma_ == "will"
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    """Lower-case and title-case contractions split identically: the first
    tokens differ only in case and the second tokens are equal."""
    lower_doc = en_tokenizer(text_lower)
    title_doc = en_tokenizer(text_title)
    lower_first = lower_doc[0].text
    title_first = title_doc[0].text
    assert title_first == lower_first.title()
    assert lower_first == title_first.lower()
    assert lower_doc[1].text == title_doc[1].text
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    """The pronoun keeps its original casing once the contraction is split off."""
    combined = pron + contraction
    tokens = en_tokenizer(combined)
    assert tokens[0].text == pron
    assert tokens[1].text == contraction
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    """Words such as "Ill" and "well" must stay a single token."""
    token_count = len(en_tokenizer(exc))
    assert token_count == 1
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    """Attached punctuation adds exactly one token to a two-token contraction."""
    bare_count = len(en_tokenizer(wo_punct))
    assert bare_count == 2
    punctuated_count = len(en_tokenizer(w_punct))
    assert punctuated_count == 3
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
def test_tokenizer_handles_abbr(en_tokenizer, text):
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 1
def test_tokenizer_handles_exc_in_text(en_tokenizer):
def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
text = "It's mediocre i.e. bad."
tokens = en_tokenizer(text)
assert len(tokens) == 6
@ -21,7 +98,7 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
def test_tokenizer_handles_times(en_tokenizer, text):
def test_en_tokenizer_handles_times(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[1].lemma_ in ["a.m.", "p.m."]

View File

@ -7,7 +7,7 @@ from __future__ import unicode_literals
import pytest
def test_simple_punct(en_tokenizer):
def test_en_simple_punct(en_tokenizer):
text = "to walk, do foo"
tokens = en_tokenizer(text)
assert tokens[0].idx == 0
@ -17,7 +17,7 @@ def test_simple_punct(en_tokenizer):
assert tokens[4].idx == 12
def test_complex_punct(en_tokenizer):
def test_en_complex_punct(en_tokenizer):
text = "Tom (D., Ill.)!"
tokens = en_tokenizer(text)
assert tokens[0].idx == 0

View File

@ -0,0 +1,46 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.fixture
def en_lemmatizer(EN):
    """Lemmatizer created through ``EN.Defaults.create_lemmatizer()``."""
    defaults = EN.Defaults
    return defaults.create_lemmatizer()
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [
    ("aardwolves", ["aardwolf"]),
    ("aardwolf", ["aardwolf"]),
    ("planets", ["planet"]),
    ("ring", ["ring"]),
    ("axes", ["axis", "axe", "ax"]),
])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
    """``noun()`` returns exactly the expected set of lemmas for each word."""
    expected = set(lemmas)
    assert en_lemmatizer.noun(text) == expected
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):
    """The ``number`` morphology feature selects between lemmas of "dive"
    (marked as an expected failure)."""
    singular = en_lemmatizer.noun('dive', {'number': 'sing'})
    assert singular == set(['dive'])
    plural = en_lemmatizer.noun('dive', {'number': 'plur'})
    assert plural == set(['diva'])
@pytest.mark.models('en')
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
    """The past-tense verb "saw" lemmatizes to "see".

    The marker names the ``'en'`` model, like the other tests in this
    module that use the ``EN``-backed fixtures.
    """
    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
@pytest.mark.models('en')
def test_en_lemmatizer_punct(en_lemmatizer):
    """Typographic double quotes lemmatize to the plain ASCII double quote.

    The marker names the ``'en'`` model, like the other tests in this
    module that use the ``EN``-backed fixtures.
    """
    # NOTE(review): both arguments were empty strings here, which looks like
    # the curly-quote characters were lost in transit. They are restored as
    # explicit escapes (left and right double quotation marks) so they cannot
    # be stripped again -- confirm against the upstream test file.
    assert en_lemmatizer.punct('\u201c') == set(['"'])
    assert en_lemmatizer.punct('\u201d') == set(['"'])
@pytest.mark.models('en')
def test_en_lemmatizer_lemma_assignment(EN):
    """Lemmas are empty after tokenization and all set once the tagger has run."""
    doc = EN.tokenizer("Bananas in pyjamas are geese.")
    # Before tagging: no token may carry a non-empty lemma.
    assert not any(token.lemma_ != '' for token in doc)
    EN.tagger(doc)
    # After tagging: no token may be left with an empty lemma.
    assert not any(token.lemma_ == '' for token in doc)

View File

@ -5,8 +5,8 @@ from spacy.attrs import LOWER
from spacy.matcher import Matcher
@pytest.mark.models
def test_simple_types(EN):
@pytest.mark.models('en')
def test_en_ner_simple_types(EN):
tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].start == 1
@ -17,8 +17,8 @@ def test_simple_types(EN):
assert ents[1].label_ == 'GPE'
@pytest.mark.models
def test_consistency_bug(EN):
@pytest.mark.models('en')
def test_en_ner_consistency_bug(EN):
'''Test an arbitrary sequence-consistency bug encountered during speed test'''
tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
@ -26,8 +26,8 @@ def test_consistency_bug(EN):
EN.entity(tokens)
@pytest.mark.models
def test_unit_end_gazetteer(EN):
@pytest.mark.models('en')
def test_en_ner_unit_end_gazetteer(EN):
'''Test a bug in the interaction between the NER model and the gazetteer'''
matcher = Matcher(EN.vocab)
matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}])
@ -38,6 +38,3 @@ def test_unit_end_gazetteer(EN):
doc.ents += tuple(ents)
EN.entity(doc)
assert list(doc.ents)[0].text == 'cal'

View File

@ -1,7 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
from ...util import get_doc
import pytest
@ -45,32 +45,3 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer):
assert len(chunks) == 2
assert chunks[0].text_with_ws == "A phrase "
assert chunks[1].text_with_ws == "another phrase "
def test_parser_noun_chunks_standard_de(de_tokenizer):
text = "Eine Tasse steht auf dem Tisch."
heads = [1, 1, 0, -1, 1, -2, -4]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 2
assert chunks[0].text_with_ws == "Eine Tasse "
assert chunks[1].text_with_ws == "dem Tisch "
def test_de_extended_chunk(de_tokenizer):
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
tokens = de_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
chunks = list(doc.noun_chunks)
assert len(chunks) == 3
assert chunks[0].text_with_ws == "Die Sängerin "
assert chunks[1].text_with_ws == "einer Tasse Kaffee "
assert chunks[2].text_with_ws == "Arien "

View File

@ -16,14 +16,14 @@ PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
@pytest.mark.parametrize('text', ["(", "((", "<"])
def test_tokenizer_handles_only_punct(en_tokenizer, text):
def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == len(text)
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(punct + text)
assert len(tokens) == 2
assert tokens[0].text == punct
@ -32,7 +32,7 @@ def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(text + punct)
assert len(tokens) == 2
assert tokens[0].text == text
@ -42,7 +42,7 @@ def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('punct_add', ["`"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
tokens = en_tokenizer(punct + punct_add + text)
assert len(tokens) == 3
assert tokens[0].text == punct
@ -53,7 +53,7 @@ def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, te
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('punct_add', ["'"])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
tokens = en_tokenizer(text + punct + punct_add)
assert len(tokens) == 3
assert tokens[0].text == text
@ -63,7 +63,7 @@ def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, t
@pytest.mark.parametrize('punct', PUNCT_OPEN)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(punct + punct + punct + text)
assert len(tokens) == 4
assert tokens[0].text == punct
@ -72,7 +72,7 @@ def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
tokens = en_tokenizer(text + punct + punct + punct)
assert len(tokens) == 4
assert tokens[0].text == text
@ -80,14 +80,14 @@ def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
@pytest.mark.parametrize('text', ["'The"])
def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
assert tokens[0].text == "'"
@pytest.mark.parametrize('text', ["Hello''"])
def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
tokens = en_tokenizer(text)
assert len(tokens) == 2
tokens_punct = en_tokenizer("''")
@ -96,7 +96,7 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
punct_close, text):
tokens = en_tokenizer(punct_open + text + punct_close)
assert len(tokens) == 3
@ -108,7 +108,7 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
@pytest.mark.parametrize('text', ["Hello"])
def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
punct_open2, punct_close2, text):
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
assert len(tokens) == 5
@ -120,13 +120,13 @@ def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
def test_tokenizer_splits_pre_punct_regex(text, punct):
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
match = en_search_prefixes(text)
assert match.group() == punct
def test_tokenizer_splits_bracket_period(en_tokenizer):
def test_en_tokenizer_splits_bracket_period(en_tokenizer):
text = "(And a 6a.m. run through Washington Park)."
tokens = en_tokenizer(text)
assert tokens[len(tokens) - 1].text == "."

View File

@ -1,9 +1,65 @@
# encoding: utf-8
# coding: utf-8
from __future__ import unicode_literals
from ....tokens import Doc
from ...util import get_doc, apply_transition_sequence
import pytest
@pytest.mark.parametrize('text', ["A test sentence"])
@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
    """A short sentence, with or without final punctuation, is one sentence
    that covers every token of the doc."""
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
    tokens = en_tokenizer(text + punct)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    # The parentheses matter: ``assert len(doc) == 4 if punct else 3`` parses
    # as ``assert (len(doc) == 4) if punct else 3``, which asserts the truthy
    # constant 3 when ``punct`` is empty and so never checks the length.
    expected_len = 4 if punct else 3
    assert len(doc) == expected_len
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)
@pytest.mark.xfail
def test_en_sentence_breaks(en_tokenizer, en_parser):
    """Apply a fixed transition sequence to a two-sentence text and check the
    sentence count, that tokens have a dependency label, and the head
    indices (marked as an expected failure)."""
    sentence_pair = "This is a sentence . This is another one ."
    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
    dep_labels = ['nsubj', 'ROOT', 'det', 'attr', 'punct',
                  'nsubj', 'ROOT', 'det', 'attr', 'punct']
    actions = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
               'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
    tokenized = en_tokenizer(sentence_pair)
    words = [t.text for t in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=heads, deps=dep_labels)
    apply_transition_sequence(en_parser, doc, actions)
    sentence_count = len(list(doc.sents))
    assert sentence_count == 2
    # Every token is either a space or has a non-zero dependency label.
    assert all(token.dep != 0 or token.is_space for token in doc)
    head_indices = [token.head.i for token in doc]
    assert head_indices == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
# Currently, there's no way of setting the serializer data for the parser
# without loading the models, so we can't remove the model dependency here yet.
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_sbd_serialization_projective(EN):
    """Sentence boundaries must be the same before and after a
    ``to_bytes``/``from_bytes`` round trip (marked as an expected failure)."""
    raw_text = "I bought a couch from IKEA It wasn't very comfortable."
    actions = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj',
               'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod',
               'R-acomp', 'D', 'R-punct']
    original = EN.tokenizer(raw_text)
    apply_transition_sequence(EN.parser, original, actions)
    restored = Doc(EN.vocab).from_bytes(original.to_bytes())
    assert original.is_parsed == True
    assert restored.is_parsed == True
    assert original.to_bytes() == restored.to_bytes()
    original_sents = [sent.text for sent in original.sents]
    restored_sents = [sent.text for sent in restored.sents]
    assert original_sents == restored_sents
TEST_CASES = [
("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
@ -59,10 +115,9 @@ TEST_CASES = [
pytest.mark.xfail(("Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]))
]
@pytest.mark.slow
@pytest.mark.models
@pytest.mark.models('en')
@pytest.mark.parametrize('text,expected_sents', TEST_CASES)
def test_parser_sbd_prag(EN, text, expected_sents):
def test_en_sbd_prag(EN, text, expected_sents):
"""SBD tests from Pragmatic Segmenter"""
doc = EN(text)
sents = []

View File

@ -0,0 +1,59 @@
# coding: utf-8
from __future__ import unicode_literals
from ....parts_of_speech import SPACE
from ...util import get_doc
import six
import pytest
def test_en_tagger_load_morph_exc(en_tokenizer):
    """A morphology exception loaded into the vocab overrides the lemma of
    "like" when it is tagged VBP."""
    sentence = "I like his style."
    pos_tags = ['PRP', 'VBP', 'PRP$', 'NN', '.']
    exceptions = {'VBP': {'like': {'L': 'luck'}}}
    morphology = en_tokenizer.vocab.morphology
    # Load the exception before tokenizing, as the tags are applied afterwards.
    morphology.load_morph_exceptions(exceptions)
    tokenized = en_tokenizer(sentence)
    words = [t.text for t in tokenized]
    doc = get_doc(tokenized.vocab, words, tags=pos_tags)
    like = doc[1]
    assert like.tag_ == 'VBP'
    assert like.lemma_ == 'luck'
@pytest.mark.models('en')
def test_tag_names(EN):
    """``pos``/``dep`` are integers, ``pos_``/``dep_`` are text, and the
    third token of the sentence is tagged NNS."""
    text = "I ate pizzas with anchovies."
    doc = EN(text, parse=False, tag=True)
    token = doc[2]
    # isinstance rather than an exact ``type(...) ==`` comparison, matching
    # the isinstance checks used for the string attributes below.
    assert isinstance(token.pos, int)
    assert isinstance(token.pos_, six.text_type)
    assert isinstance(token.dep, int)
    assert isinstance(token.dep_, six.text_type)
    assert token.tag_ == u'NNS'
@pytest.mark.models('en')
def test_en_tagger_spaces(EN):
    """Whitespace tokens get the SPACE part-of-speech; word tokens do not."""
    doc = EN("Some\nspaces are\tnecessary.", tag=True, parse=False)
    first_word = doc[0]
    assert first_word.pos != SPACE
    assert first_word.pos_ != 'SPACE'
    newline = doc[1]
    assert newline.pos == SPACE
    assert newline.pos_ == 'SPACE'
    assert newline.tag_ == 'SP'
    for word_index in (2, 3):
        assert doc[word_index].pos != SPACE
    assert doc[4].pos == SPACE
@pytest.mark.models('en')
def test_en_tagger_return_char(EN):
    """Whitespace tokens made of carriage returns and newlines get the
    SPACE part-of-speech."""
    text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if '
            'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n')
    tokens = EN(text)
    # Every token flagged as whitespace must be tagged SPACE.
    assert all(token.pos == SPACE for token in tokens if token.is_space)
    line_break = tokens[3]
    assert line_break.text == '\r\n\r\n'
    assert line_break.is_space
    assert line_break.pos == SPACE

View File

@ -7,7 +7,7 @@ from __future__ import unicode_literals
import pytest
def test_tokenizer_handles_long_text(en_tokenizer):
def test_en_tokenizer_handles_long_text(en_tokenizer):
text = """Tributes pour in for late British Labour Party leader
Tributes poured in from around the world Thursday
@ -30,7 +30,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
("""'Me too!', Mr. P. Delaware cried. """, 11),
("They ran about 10km.", 6),
pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
tokens = en_tokenizer(text)
assert len(tokens) == length

View File

@ -1,37 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('fr')
def test_lemmatizer_verb(FR):
text = "Qu'est-ce que tu fais?"
tokens = FR(text)
tokens = FR("Qu'est-ce que tu fais?")
assert tokens[0].lemma_ == "que"
assert tokens[1].lemma_ == "être"
assert tokens[5].lemma_ == "faire"
@pytest.mark.models
@pytest.mark.models('fr')
@pytest.mark.xfail(reason="sont tagged as AUX")
def test_lemmatizer_noun_verb_2(FR):
text = "Les abaissements de température sont gênants."
tokens = FR(text)
tokens = FR("Les abaissements de température sont gênants.")
assert tokens[4].lemma_ == "être"
@pytest.mark.models
@pytest.mark.models('fr')
@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
def test_lemmatizer_noun(FR):
text = "il y a des Costaricienne."
tokens = FR(text)
def test_lemmatizer_noun(FR):
    """The token "Costaricienne" should lemmatize to "Costaricain".

    The parameter must be the ``FR`` fixture: the body calls ``FR(...)``,
    and with the former ``model`` parameter that name was not bound inside
    the function.
    """
    tokens = FR("il y a des Costaricienne.")
    assert tokens[4].lemma_ == "Costaricain"
@pytest.mark.models
@pytest.mark.models('fr')
def test_lemmatizer_noun_2(FR):
text = "Les abaissements de température sont gênants."
tokens = FR(text)
tokens = FR("Les abaissements de température sont gênants.")
assert tokens[1].lemma_ == "abaissement"
assert tokens[5].lemma_ == "gênant"

View File

@ -1,60 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ...tokens import Doc
from ..util import get_doc, apply_transition_sequence
import pytest
@pytest.mark.parametrize('text', ["A test sentence"])
@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
def test_parser_sbd_single_punct(en_tokenizer, text, punct):
heads = [2, 1, 0, -1] if punct else [2, 1, 0]
tokens = en_tokenizer(text + punct)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
assert len(doc) == 4 if punct else 3
assert len(list(doc.sents)) == 1
assert sum(len(sent) for sent in doc.sents) == len(doc)
@pytest.mark.xfail
def test_parser_sentence_breaks(en_tokenizer, en_parser):
text = "This is a sentence . This is another one ."
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
'attr', 'punct']
transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
apply_transition_sequence(en_parser, doc, transition)
assert len(list(doc.sents)) == 2
for token in doc:
assert token.dep != 0 or token.is_space
assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
# Currently, there's no way of setting the serializer data for the parser
# without loading the models, so we can't remove the model dependency here yet.
@pytest.mark.xfail
@pytest.mark.models
def test_parser_sbd_serialization_projective(EN):
"""Test that before and after serialization, the sentence boundaries are
the same."""
text = "I bought a couch from IKEA It wasn't very comfortable."
transition = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj',
'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod',
'R-acomp', 'D', 'R-punct']
doc = EN.tokenizer(text)
apply_transition_sequence(EN.parser, doc, transition)
doc_serialized = Doc(EN.vocab).from_bytes(doc.to_bytes())
assert doc.is_parsed == True
assert doc_serialized.is_parsed == True
assert doc.to_bytes() == doc_serialized.to_bytes()
assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]

View File

@ -1,17 +1,11 @@
import pytest
from ...pipeline import NeuralDependencyParser
from ...vocab import Vocab
@pytest.fixture
def vocab():
return Vocab()
@pytest.fixture
def parser(vocab):
parser = NeuralDependencyParser(vocab)
def parser(en_vocab):
parser = NeuralDependencyParser(en_vocab)
parser.add_label('nsubj')
parser.model, cfg = parser.Model(parser.moves.n_moves)
parser.cfg.update(cfg)
@ -19,8 +13,8 @@ def parser(vocab):
@pytest.fixture
def blank_parser(vocab):
parser = NeuralDependencyParser(vocab)
def blank_parser(en_vocab):
parser = NeuralDependencyParser(en_vocab)
return parser

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
@pytest.mark.parametrize('text,i', [("Jane's got a new car", 1),
("Jane thinks that's a nice car", 3)])
def test_issue401(EN, text, i):

View File

@ -6,7 +6,7 @@ from ...matcher import Matcher
import pytest
@pytest.mark.models
@pytest.mark.models('en')
def test_issue429(EN):
def merge_phrases(matcher, doc, i, matches):
if i != len(matches) - 1:

View File

@ -6,7 +6,7 @@ from ..util import get_doc
import pytest
@pytest.mark.models
@pytest.mark.models('en')
def test_issue514(EN):
"""Test serializing after adding entity"""
text = ["This", "is", "a", "sentence", "about", "pasta", "."]

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
def test_issue54(EN):
text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
tokens = EN(text)

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
@pytest.mark.parametrize('text', ["He is the man", "he is the man"])
def test_issue686(EN, text):
"""Test that pronoun lemmas are assigned correctly."""

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
def test_issue693(EN):
"""Test that doc.noun_chunks parses the complete sentence."""

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
def test_issue704(EN):
"""Test that sentence boundaries are detected correctly."""

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
@pytest.mark.parametrize('text1,text2',
[("You're happy", "You are happy"),
("I'm happy", "I am happy"),

View File

@ -4,7 +4,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.models('en')
@pytest.mark.parametrize('text', ["s..."])
def test_issue719(EN, text):
"""Test that the token 's' is not lemmatized into empty string."""

View File

@ -4,7 +4,7 @@ import pytest
@pytest.mark.xfail
@pytest.mark.models
@pytest.mark.models('en')
def test_issue758(EN):
'''Test parser transition bug after label added.'''
from ...matcher import merge_phrase

View File

@ -5,6 +5,8 @@ import pytest
# Note: "chromosomes" worked previous the bug fix
@pytest.mark.models('en')
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
def test_issue781(lemmatizer, word, lemmas):
def test_issue781(EN, word, lemmas):
lemmatizer = EN.Defaults.create_lemmatizer()
assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)

View File

@ -70,8 +70,8 @@ def temp_save_model(model):
@pytest.mark.models
def test_issue910(train_data, additional_entity_types):
@pytest.mark.models('en')
def test_issue910(EN, train_data, additional_entity_types):
'''Test that adding entities and resuming training works passably OK.
There are two issues here:
@ -79,8 +79,7 @@ def test_issue910(train_data, additional_entity_types):
2) There's no way to set the learning rate for the weight update, so we
end up out-of-scale, causing it to learn too fast.
'''
nlp = English()
doc = nlp(u"I am looking for a restaurant in Berlin")
doc = EN(u"I am looking for a restaurant in Berlin")
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
# Fine tune the ner model
for entity_type in additional_entity_types:

View File

@ -1,18 +1,13 @@
from __future__ import unicode_literals
import pytest
from ... import load as load_spacy
@pytest.fixture
def doc():
nlp = load_spacy('en')
return nlp('Does flight number three fifty-four require a connecting flight'
' to get to Boston?')
@pytest.mark.models
def test_issue955(doc):
@pytest.mark.models('en')
def test_issue955(EN, doc):
'''Test that we don't have any nested noun chunks'''
doc = EN('Does flight number three fifty-four require a connecting flight'
' to get to Boston?')
seen_tokens = set()
for np in doc.noun_chunks:
print(np.text, np.root.text, np.root.dep_, np.root.tag_)

View File

@ -1,49 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.mark.models
@pytest.mark.parametrize('text,lemmas', [
    ("aardwolves", ["aardwolf"]),
    ("aardwolf", ["aardwolf"]),
    ("planets", ["planet"]),
    ("ring", ["ring"]),
    ("axes", ["axis", "axe", "ax"]),
])
def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
    """Test that the noun lemmatizer returns exactly the expected lemma set."""
    # Nothing is checked when the fixture provides no lemmatizer.
    if lemmatizer is not None:
        found = lemmatizer.noun(text)
        assert found == set(lemmas)
@pytest.mark.xfail
@pytest.mark.models
def test_tagger_lemmatizer_base_forms(lemmatizer):
    """Test the noun lemma of 'dive' under singular and plural morphology.

    The test is marked as an expected failure.
    """
    # Nothing is checked when the fixture provides no lemmatizer.
    if lemmatizer is not None:
        for number, lemma in (('sing', 'dive'), ('plur', 'diva')):
            assert lemmatizer.noun('dive', {'number': number}) == set([lemma])
@pytest.mark.models
def test_tagger_lemmatizer_base_form_verb(lemmatizer):
    """Test that the past-tense verb form 'saw' is lemmatized to 'see'."""
    # Nothing is checked when the fixture provides no lemmatizer.
    if lemmatizer is not None:
        morphology = {'verbform': 'past'}
        assert lemmatizer.verb('saw', morphology) == set(['see'])
@pytest.mark.models
def test_tagger_lemmatizer_punct(lemmatizer):
    """Test that curly double quotes are lemmatized to the ASCII quote.

    Returns early, without asserting anything, when the fixture provides no
    lemmatizer.
    """
    if lemmatizer is None:
        return None
    # Left and right curly double quotes, spelled as escapes so the literals
    # cannot be emptied by an encoding round-trip (which would leave two
    # identical assertions on the empty string).
    # NOTE(review): inputs restored from the evident intent of the two
    # assertions -- confirm against the project's history.
    assert lemmatizer.punct('\u201c') == set(['"'])
    assert lemmatizer.punct('\u201d') == set(['"'])
@pytest.mark.models
def test_tagger_lemmatizer_lemma_assignment(EN):
    """Test that lemmas are empty after tokenization and set by the tagger."""
    doc = EN.tokenizer("Bananas in pyjamas are geese.")
    # Straight out of the tokenizer, no token carries a lemma yet.
    assert not any(token.lemma_ != '' for token in doc)
    EN.tagger(doc)
    # Once the tagger has run, every token must have a non-empty lemma.
    assert not any(token.lemma_ == '' for token in doc)

View File

@ -1,17 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from ..util import get_doc
import pytest
def test_tagger_load_morph_exc(en_tokenizer):
    """Test that a loaded morphology exception determines a token's lemma."""
    # Exception entry for the word "like" under the tag VBP; the assertions
    # below expect it to surface as the lemma "luck".
    exceptions = {'VBP': {'like': {'L': 'luck'}}}
    en_tokenizer.vocab.morphology.load_morph_exceptions(exceptions)
    tokens = en_tokenizer("I like his style.")
    words = [token.text for token in tokens]
    doc = get_doc(tokens.vocab, words,
                  tags=['PRP', 'VBP', 'PRP$', 'NN', '.'])
    second = doc[1]
    assert second.tag_ == 'VBP'
    assert second.lemma_ == 'luck'

View File

@ -1,35 +0,0 @@
# coding: utf-8
"""Ensure spaces are assigned the POS tag SPACE"""
from __future__ import unicode_literals
from ...parts_of_speech import SPACE
import pytest
@pytest.mark.models
def test_tagger_spaces(EN):
    """Test which tokens of a text with a newline and a tab get POS SPACE."""
    doc = EN("Some\nspaces are\tnecessary.", tag=True, parse=False)
    first = doc[0]
    assert first.pos != SPACE
    assert first.pos_ != 'SPACE'
    # The second token is expected to be whitespace, with the fine-grained
    # tag 'SP'.
    second = doc[1]
    assert second.pos == SPACE
    assert second.pos_ == 'SPACE'
    assert second.tag_ == 'SP'
    for index in (2, 3):
        assert doc[index].pos != SPACE
    assert doc[4].pos == SPACE
@pytest.mark.models
def test_tagger_return_char(EN):
    """Test that whitespace tokens in text with '\\r\\n' breaks get POS SPACE."""
    text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if '
            'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n')
    tokens = EN(text)
    # Every token flagged as whitespace must carry the SPACE part-of-speech.
    assert all(token.pos == SPACE for token in tokens if token.is_space)
    # The fourth token is expected to be the first double line break.
    fourth = tokens[3]
    assert fourth.text == '\r\n\r\n'
    assert fourth.is_space
    assert fourth.pos == SPACE

View File

@ -1,16 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import six
import pytest
@pytest.mark.models
def test_tag_names(EN):
    """Test the types of a token's POS/dependency attributes and its tag."""
    doc = EN("I ate pizzas with anchovies.", parse=False, tag=True)
    token = doc[2]
    # Attributes without a trailing underscore are expected to be plain ints;
    # the underscore variants are expected to be text.
    assert type(token.pos) is int
    assert isinstance(token.pos_, six.text_type)
    assert type(token.dep) is int
    assert isinstance(token.dep_, six.text_type)
    assert token.tag_ == u'NNS'

View File

@ -20,6 +20,7 @@ def test_util_ensure_path_succeeds(text):
assert isinstance(path, Path)
@pytest.mark.models
def test_simple_model_roundtrip_bytes():
model = Maxout(5, 10, pieces=2)
model.b += 1
@ -29,6 +30,7 @@ def test_simple_model_roundtrip_bytes():
assert model.b[0, 0] == 1
@pytest.mark.models
def test_multi_model_roundtrip_bytes():
model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
model._layers[0].b += 1
@ -41,6 +43,7 @@ def test_multi_model_roundtrip_bytes():
assert model._layers[1].b[0, 0] == 2
@pytest.mark.models
def test_multi_model_load_missing_dims():
model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
model._layers[0].b += 1
@ -52,6 +55,7 @@ def test_multi_model_load_missing_dims():
assert model2._layers[0].b[0, 0] == 1
assert model2._layers[1].b[0, 0] == 2
@pytest.mark.parametrize('package', ['thinc'])
def test_util_is_package(package):
"""Test that an installed package via pip is recognised by util.is_package."""