mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Update model fixtures and reorganise tests
This commit is contained in:
parent
795fe43a4d
commit
20a7003c0d
|
@ -1,19 +1,40 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..tokens import Doc
|
||||
from ..strings import StringStore
|
||||
from ..lemmatizer import Lemmatizer
|
||||
from ..attrs import ORTH, TAG, HEAD, DEP
|
||||
from .. import util
|
||||
|
||||
from io import StringIO, BytesIO
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from .util import load_test_model
|
||||
from ..tokens import Doc
|
||||
from ..strings import StringStore
|
||||
from .. import util
|
||||
|
||||
|
||||
_languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
|
||||
'nl', 'pl', 'pt', 'sv']
|
||||
'nl', 'pl', 'pt', 'sv', 'xx']
|
||||
_models = {'en': ['en_core_web_sm', 'en_core_web_md'],
|
||||
'de': ['de_core_news_md'],
|
||||
'fr': ['fr_depvec_web_lg'],
|
||||
'xx': ['xx_ent_web_md']}
|
||||
|
||||
|
||||
# only used for tests that require loading the models
|
||||
# in all other cases, use specific instances
|
||||
|
||||
@pytest.fixture(params=_models['en'], scope="session")
def EN(request):
    """Session-scoped fixture, parametrized over the names in ``_models['en']``.

    The current parameter is passed to ``load_test_model`` and the result is
    returned to the requesting test.
    """
    model_name = request.param
    return load_test_model(model_name)
|
||||
|
||||
|
||||
@pytest.fixture(params=_models['de'], scope="session")
def DE(request):
    """Session-scoped fixture, parametrized over the names in ``_models['de']``.

    The current parameter is passed to ``load_test_model`` and the result is
    returned to the requesting test.
    """
    model_name = request.param
    return load_test_model(model_name)
|
||||
|
||||
|
||||
@pytest.fixture(params=_models['fr'], scope="session")
def FR(request):
    """Session-scoped fixture, parametrized over the names in ``_models['fr']``.

    The current parameter is passed to ``load_test_model`` and the result is
    returned to the requesting test.
    """
    model_name = request.param
    return load_test_model(model_name)
|
||||
|
||||
|
||||
@pytest.fixture(params=_languages)
|
||||
|
@ -91,11 +112,6 @@ def en_entityrecognizer():
|
|||
return util.get_lang_class('en').Defaults.create_entity()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
return util.get_lang_class('en').Defaults.create_lemmatizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def text_file():
|
||||
return StringIO()
|
||||
|
@ -105,22 +121,6 @@ def text_file_b():
|
|||
return BytesIO()
|
||||
|
||||
|
||||
# only used for tests that require loading the models
|
||||
# in all other cases, use specific instances
|
||||
@pytest.fixture(scope="session")
|
||||
def EN():
|
||||
return English()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def DE():
|
||||
return German()
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def FR():
|
||||
return French()
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption("--models", action="store_true",
|
||||
help="include tests that require full models")
|
||||
|
|
35
spacy/tests/lang/de/test_parser.py
Normal file
35
spacy/tests/lang/de/test_parser.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...util import get_doc
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_de_parser_noun_chunks_standard_de(de_tokenizer):
    """Noun chunks of a German sentence whose parse is supplied by hand.

    Heads, tags and dependency labels are passed to ``get_doc`` directly, so
    the expected chunks depend only on those annotations.
    """
    sentence = "Eine Tasse steht auf dem Tisch."
    tokenized = de_tokenizer(sentence)
    words = [token.text for token in tokenized]
    doc = get_doc(
        tokenized.vocab,
        words,
        tags=['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.'],
        deps=['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct'],
        heads=[1, 1, 0, -1, 1, -2, -4],
    )

    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == 2
    first, second = noun_chunks[0], noun_chunks[1]
    assert first.text_with_ws == "Eine Tasse "
    assert second.text_with_ws == "dem Tisch "
|
||||
|
||||
|
||||
def test_de_extended_chunk(de_tokenizer):
    """Noun chunks of a German sentence containing a multi-noun chunk.

    The parse is supplied by hand through ``get_doc``; three chunks are
    expected, the second spanning "einer Tasse Kaffee".
    """
    sentence = "Die Sängerin singt mit einer Tasse Kaffee Arien."
    tokenized = de_tokenizer(sentence)
    words = [token.text for token in tokenized]
    doc = get_doc(
        tokenized.vocab,
        words,
        tags=['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.'],
        deps=['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct'],
        heads=[1, 1, 0, -1, 1, -2, -1, -5, -6],
    )

    noun_chunks = list(doc.noun_chunks)
    assert len(noun_chunks) == 3
    assert noun_chunks[0].text_with_ws == "Die Sängerin "
    assert noun_chunks[1].text_with_ws == "einer Tasse Kaffee "
    assert noun_chunks[2].text_with_ws == "Arien "
|
|
@ -1,87 +0,0 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokens are created correctly for contractions."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_basic_contraction(en_tokenizer):
|
||||
text = "don't giggle"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].text == "n't"
|
||||
text = "i said don't!"
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 5
|
||||
assert tokens[4].text == "!"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
|
||||
def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
|
||||
def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
|
||||
tokens = en_tokenizer(text_poss)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text
|
||||
assert tokens[1].text == "'s"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
|
||||
def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text.split("'")[0]
|
||||
assert tokens[1].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
|
||||
def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
|
||||
def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text.split("'")[0]
|
||||
assert tokens[1].text == "'ll"
|
||||
assert tokens[1].lemma_ == "will"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
|
||||
def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
|
||||
tokens_lower = en_tokenizer(text_lower)
|
||||
tokens_title = en_tokenizer(text_title)
|
||||
assert tokens_title[0].text == tokens_lower[0].text.title()
|
||||
assert tokens_lower[0].text == tokens_title[0].text.lower()
|
||||
assert tokens_lower[1].text == tokens_title[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
|
||||
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
|
||||
def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
|
||||
tokens = en_tokenizer(pron + contraction)
|
||||
assert tokens[0].text == pron
|
||||
assert tokens[1].text == contraction
|
||||
|
||||
|
||||
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
|
||||
def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
|
||||
tokens = en_tokenizer(exc)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
|
||||
def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
|
||||
tokens = en_tokenizer(wo_punct)
|
||||
assert len(tokens) == 2
|
||||
tokens = en_tokenizer(w_punct)
|
||||
assert len(tokens) == 3
|
|
@ -1,19 +1,96 @@
|
|||
# coding: utf-8
|
||||
"""Test that tokenizer exceptions are handled correctly."""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
    """The negation clitic is split off, with and without trailing punctuation."""
    doc = en_tokenizer("don't giggle")
    assert len(doc) == 3
    assert doc[1].text == "n't"

    doc = en_tokenizer("i said don't!")
    assert len(doc) == 5
    assert doc[4].text == "!"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
    """A contraction with one attached punctuation mark gives three tokens."""
    token_count = len(en_tokenizer(text))
    assert token_count == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
    """A possessive form splits into the bare name and a "'s" token."""
    doc = en_tokenizer(text_poss)
    assert len(doc) == 2
    name_token, clitic_token = doc[0], doc[1]
    assert name_token.text == text
    assert clitic_token.text == "'s"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
    """A trailing apostrophe becomes its own token after the word."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    # Everything before the first apostrophe is the expected word token.
    expected_word = text.split("'")[0]
    assert doc[0].text == expected_word
    assert doc[1].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
    """Apostrophe exceptions such as "'em" stay a single token.

    NOTE: this function was previously named ``text_tokenizer_...`` (typo for
    ``test_``), so pytest's default discovery never collected or ran it.
    """
    tokens = en_tokenizer(text)
    assert len(tokens) == 1
    assert tokens[0].text == text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
    """The "'ll" clitic is split off and carries the lemma "will"."""
    doc = en_tokenizer(text)
    assert len(doc) == 2
    # Everything before the apostrophe is the expected first token.
    expected_first = text.split("'")[0]
    clitic = doc[1]
    assert doc[0].text == expected_first
    assert clitic.text == "'ll"
    assert clitic.lemma_ == "will"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
    """Lower-case and title-case contractions tokenize the same way, modulo case."""
    lower_doc = en_tokenizer(text_lower)
    title_doc = en_tokenizer(text_title)
    lower_first = lower_doc[0].text
    title_first = title_doc[0].text
    assert title_first == lower_first.title()
    assert lower_first == title_first.lower()
    # The clitic itself is unaffected by the casing of the first token.
    assert lower_doc[1].text == title_doc[1].text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
@pytest.mark.parametrize('contraction', ["'ll", "'d"])
def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
    """Pronoun + clitic splits in two and the pronoun's casing is kept."""
    combined = pron + contraction
    doc = en_tokenizer(combined)
    assert doc[0].text == pron
    assert doc[1].text == contraction
|
||||
|
||||
|
||||
@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
    """Words that merely look like apostrophe-less contractions stay whole."""
    token_count = len(en_tokenizer(exc))
    assert token_count == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
    """Attached punctuation adds exactly one token to a two-token contraction."""
    bare_doc = en_tokenizer(wo_punct)
    assert len(bare_doc) == 2
    punct_doc = en_tokenizer(w_punct)
    assert len(punct_doc) == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
|
||||
def test_tokenizer_handles_abbr(en_tokenizer, text):
|
||||
def test_en_tokenizer_handles_abbr(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 1
|
||||
|
||||
|
||||
def test_tokenizer_handles_exc_in_text(en_tokenizer):
|
||||
def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
|
||||
text = "It's mediocre i.e. bad."
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 6
|
||||
|
@ -21,7 +98,7 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
|
||||
def test_tokenizer_handles_times(en_tokenizer, text):
|
||||
def test_en_tokenizer_handles_times(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[1].lemma_ in ["a.m.", "p.m."]
|
||||
|
|
|
@ -7,7 +7,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
def test_simple_punct(en_tokenizer):
|
||||
def test_en_simple_punct(en_tokenizer):
|
||||
text = "to walk, do foo"
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].idx == 0
|
||||
|
@ -17,7 +17,7 @@ def test_simple_punct(en_tokenizer):
|
|||
assert tokens[4].idx == 12
|
||||
|
||||
|
||||
def test_complex_punct(en_tokenizer):
|
||||
def test_en_complex_punct(en_tokenizer):
|
||||
text = "Tom (D., Ill.)!"
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[0].idx == 0
|
||||
|
|
46
spacy/tests/lang/en/test_lemmatizer.py
Normal file
46
spacy/tests/lang/en/test_lemmatizer.py
Normal file
|
@ -0,0 +1,46 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def en_lemmatizer(EN):
    """Lemmatizer built from the ``Defaults`` of the loaded ``EN`` model."""
    defaults = EN.Defaults
    return defaults.create_lemmatizer()
|
||||
|
||||
|
||||
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
                                         ("aardwolf", ["aardwolf"]),
                                         ("planets", ["planet"]),
                                         ("ring", ["ring"]),
                                         ("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
    """``noun()`` returns exactly the expected set of candidate lemmas."""
    produced = en_lemmatizer.noun(text)
    expected = set(lemmas)
    assert produced == expected
|
||||
|
||||
|
||||
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):
    """The lemma of "dive" depends on the grammatical number passed in.

    Marked xfail: currently expected to fail.
    """
    singular = en_lemmatizer.noun('dive', {'number': 'sing'})
    assert singular == set(['dive'])
    plural = en_lemmatizer.noun('dive', {'number': 'plur'})
    assert plural == set(['diva'])
|
||||
|
||||
|
||||
# Marker names the language ('en') like the other tests in this module that
# depend on the EN-backed ``en_lemmatizer`` fixture.
@pytest.mark.models('en')
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
    """A past-tense verb form is reduced to its base form ("saw" -> "see")."""
    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
|
||||
|
||||
|
||||
# Marker names the language ('en') like the other tests in this module that
# depend on the EN-backed ``en_lemmatizer`` fixture.
@pytest.mark.models('en')
def test_en_lemmatizer_punct(en_lemmatizer):
    """Curly double quotes are lemmatized to the plain ASCII double quote."""
    assert en_lemmatizer.punct('“') == set(['"'])
    # The original repeated the opening-quote assertion verbatim; the closing
    # quote is presumably what was intended -- TODO confirm the lemmatizer
    # has a rule for it.
    assert en_lemmatizer.punct('”') == set(['"'])
|
||||
|
||||
|
||||
@pytest.mark.models('en')
def test_en_lemmatizer_lemma_assignment(EN):
    """Lemmas are empty after tokenization and filled once the tagger has run."""
    doc = EN.tokenizer("Bananas in pyjamas are geese.")
    assert all(token.lemma_ == '' for token in doc)
    # The tagger is applied in place; the return value is not used.
    EN.tagger(doc)
    assert all(token.lemma_ != '' for token in doc)
|
|
@ -5,8 +5,8 @@ from spacy.attrs import LOWER
|
|||
from spacy.matcher import Matcher
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_simple_types(EN):
|
||||
@pytest.mark.models('en')
|
||||
def test_en_ner_simple_types(EN):
|
||||
tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].start == 1
|
||||
|
@ -17,8 +17,8 @@ def test_simple_types(EN):
|
|||
assert ents[1].label_ == 'GPE'
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_consistency_bug(EN):
|
||||
@pytest.mark.models('en')
|
||||
def test_en_ner_consistency_bug(EN):
|
||||
'''Test an arbitrary sequence-consistency bug encountered during speed test'''
|
||||
tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
|
||||
tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)
|
||||
|
@ -26,8 +26,8 @@ def test_consistency_bug(EN):
|
|||
EN.entity(tokens)
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_unit_end_gazetteer(EN):
|
||||
@pytest.mark.models('en')
|
||||
def test_en_ner_unit_end_gazetteer(EN):
|
||||
'''Test a bug in the interaction between the NER model and the gazetteer'''
|
||||
matcher = Matcher(EN.vocab)
|
||||
matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}])
|
||||
|
@ -38,6 +38,3 @@ def test_unit_end_gazetteer(EN):
|
|||
doc.ents += tuple(ents)
|
||||
EN.entity(doc)
|
||||
assert list(doc.ents)[0].text == 'cal'
|
||||
|
||||
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..util import get_doc
|
||||
from ...util import get_doc
|
||||
|
||||
import pytest
|
||||
|
||||
|
@ -45,32 +45,3 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer):
|
|||
assert len(chunks) == 2
|
||||
assert chunks[0].text_with_ws == "A phrase "
|
||||
assert chunks[1].text_with_ws == "another phrase "
|
||||
|
||||
|
||||
def test_parser_noun_chunks_standard_de(de_tokenizer):
|
||||
text = "Eine Tasse steht auf dem Tisch."
|
||||
heads = [1, 1, 0, -1, 1, -2, -4]
|
||||
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
|
||||
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
|
||||
|
||||
tokens = de_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 2
|
||||
assert chunks[0].text_with_ws == "Eine Tasse "
|
||||
assert chunks[1].text_with_ws == "dem Tisch "
|
||||
|
||||
|
||||
def test_de_extended_chunk(de_tokenizer):
|
||||
text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
|
||||
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
|
||||
tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
|
||||
deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
|
||||
|
||||
tokens = de_tokenizer(text)
|
||||
doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert len(chunks) == 3
|
||||
assert chunks[0].text_with_ws == "Die Sängerin "
|
||||
assert chunks[1].text_with_ws == "einer Tasse Kaffee "
|
||||
assert chunks[2].text_with_ws == "Arien "
|
|
@ -16,14 +16,14 @@ PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["(", "((", "<"])
|
||||
def test_tokenizer_handles_only_punct(en_tokenizer, text):
|
||||
def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == len(text)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
||||
def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(punct + text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == punct
|
||||
|
@ -32,7 +32,7 @@ def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
|
|||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
||||
def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(text + punct)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == text
|
||||
|
@ -42,7 +42,7 @@ def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
|
|||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('punct_add', ["`"])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
|
||||
def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
|
||||
tokens = en_tokenizer(punct + punct_add + text)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == punct
|
||||
|
@ -53,7 +53,7 @@ def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, te
|
|||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('punct_add', ["'"])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
|
||||
def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
|
||||
tokens = en_tokenizer(text + punct + punct_add)
|
||||
assert len(tokens) == 3
|
||||
assert tokens[0].text == text
|
||||
|
@ -63,7 +63,7 @@ def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, t
|
|||
|
||||
@pytest.mark.parametrize('punct', PUNCT_OPEN)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
||||
def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(punct + punct + punct + text)
|
||||
assert len(tokens) == 4
|
||||
assert tokens[0].text == punct
|
||||
|
@ -72,7 +72,7 @@ def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
|
|||
|
||||
@pytest.mark.parametrize('punct', PUNCT_CLOSE)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
||||
def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
||||
tokens = en_tokenizer(text + punct + punct + punct)
|
||||
assert len(tokens) == 4
|
||||
assert tokens[0].text == text
|
||||
|
@ -80,14 +80,14 @@ def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text', ["'The"])
|
||||
def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
|
||||
def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
assert tokens[0].text == "'"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["Hello''"])
|
||||
def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
||||
def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == 2
|
||||
tokens_punct = en_tokenizer("''")
|
||||
|
@ -96,7 +96,7 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
|
|||
|
||||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
||||
def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
||||
punct_close, text):
|
||||
tokens = en_tokenizer(punct_open + text + punct_close)
|
||||
assert len(tokens) == 3
|
||||
|
@ -108,7 +108,7 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
|
|||
@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
|
||||
@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
|
||||
@pytest.mark.parametrize('text', ["Hello"])
|
||||
def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
||||
def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
||||
punct_open2, punct_close2, text):
|
||||
tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
|
||||
assert len(tokens) == 5
|
||||
|
@ -120,13 +120,13 @@ def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
|
|||
|
||||
|
||||
@pytest.mark.parametrize('text,punct', [("(can't", "(")])
|
||||
def test_tokenizer_splits_pre_punct_regex(text, punct):
|
||||
def test_en_tokenizer_splits_pre_punct_regex(text, punct):
|
||||
en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
|
||||
match = en_search_prefixes(text)
|
||||
assert match.group() == punct
|
||||
|
||||
|
||||
def test_tokenizer_splits_bracket_period(en_tokenizer):
|
||||
def test_en_tokenizer_splits_bracket_period(en_tokenizer):
|
||||
text = "(And a 6a.m. run through Washington Park)."
|
||||
tokens = en_tokenizer(text)
|
||||
assert tokens[len(tokens) - 1].text == "."
|
||||
|
|
|
@ -1,9 +1,65 @@
|
|||
# encoding: utf-8
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ....tokens import Doc
|
||||
from ...util import get_doc, apply_transition_sequence
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["A test sentence"])
@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
def test_en_sbd_single_punct(en_tokenizer, text, punct):
    """A single sentence, with or without final punctuation, is one sentence.

    The heads are supplied by hand via ``get_doc``; an extra head entry is
    added when a punctuation token is present.
    """
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
    tokens = en_tokenizer(text + punct)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    # Parentheses are required: ``assert len(doc) == 4 if punct else 3``
    # parses as ``(len(doc) == 4) if punct else 3``, which asserts the
    # constant 3 (always truthy) for the empty-punct case.
    assert len(doc) == (4 if punct else 3)
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_en_sentence_breaks(en_tokenizer, en_parser):
    """Applying a fixed transition sequence yields two sentences.

    Also checks that every non-space token has a dependency label set and
    that the resulting head indices match the expected tree.
    Marked xfail: currently expected to fail.
    """
    sentence_pair = "This is a sentence . This is another one ."
    tokenized = en_tokenizer(sentence_pair)
    words = [tok.text for tok in tokenized]
    doc = get_doc(
        tokenized.vocab,
        words,
        heads=[1, 0, 1, -2, -3, 1, 0, 1, -2, -3],
        deps=['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
              'attr', 'punct'],
    )
    actions = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
               'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
    apply_transition_sequence(en_parser, doc, actions)

    sentences = list(doc.sents)
    assert len(sentences) == 2
    for tok in doc:
        assert tok.dep != 0 or tok.is_space
    head_indices = [tok.head.i for tok in doc]
    assert head_indices == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
|
||||
|
||||
|
||||
# Currently, there's no way of setting the serializer data for the parser
|
||||
# without loading the models, so we can't remove the model dependency here yet.
|
||||
|
||||
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_sbd_serialization_projective(EN):
    """Test that before and after serialization, the sentence boundaries are
    the same.

    Marked xfail: currently expected to fail.
    """
    text = "I bought a couch from IKEA It wasn't very comfortable."
    transition = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj',
                  'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod',
                  'R-acomp', 'D', 'R-punct']

    doc = EN.tokenizer(text)
    apply_transition_sequence(EN.parser, doc, transition)
    # Round-trip the parsed doc through its byte representation.
    doc_serialized = Doc(EN.vocab).from_bytes(doc.to_bytes())
    # Plain truth checks instead of ``== True`` comparisons.
    assert doc.is_parsed
    assert doc_serialized.is_parsed
    assert doc.to_bytes() == doc_serialized.to_bytes()
    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]
|
||||
|
||||
|
||||
TEST_CASES = [
|
||||
("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
|
||||
("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
|
||||
|
@ -59,10 +115,9 @@ TEST_CASES = [
|
|||
pytest.mark.xfail(("Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]))
|
||||
]
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
@pytest.mark.parametrize('text,expected_sents', TEST_CASES)
|
||||
def test_parser_sbd_prag(EN, text, expected_sents):
|
||||
def test_en_sbd_prag(EN, text, expected_sents):
|
||||
"""SBD tests from Pragmatic Segmenter"""
|
||||
doc = EN(text)
|
||||
sents = []
|
59
spacy/tests/lang/en/test_tagger.py
Normal file
59
spacy/tests/lang/en/test_tagger.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ....parts_of_speech import SPACE
|
||||
from ...util import get_doc
|
||||
|
||||
import six
|
||||
import pytest
|
||||
|
||||
|
||||
def test_en_tagger_load_morph_exc(en_tokenizer):
    """A loaded morphology exception overrides the lemma for a tag/word pair.

    The exception maps "like" tagged VBP to the lemma "luck". It is loaded
    into the tokenizer's vocab before the doc is built with those tags.
    """
    sentence = "I like his style."
    tag_sequence = ['PRP', 'VBP', 'PRP$', 'NN', '.']
    exceptions = {'VBP': {'like': {'L': 'luck'}}}
    en_tokenizer.vocab.morphology.load_morph_exceptions(exceptions)

    tokenized = en_tokenizer(sentence)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words, tags=tag_sequence)
    verb = doc[1]
    assert verb.tag_ == 'VBP'
    assert verb.lemma_ == 'luck'
|
||||
|
||||
|
||||
@pytest.mark.models('en')
def test_tag_names(EN):
    """Integer and string forms of pos/dep have the expected types.

    Also checks that "pizzas" receives the fine-grained tag ``NNS``.
    """
    text = "I ate pizzas with anchovies."
    doc = EN(text, parse=False, tag=True)
    # isinstance() rather than ``type(x) == int``, matching the text checks.
    assert isinstance(doc[2].pos, int)
    assert isinstance(doc[2].pos_, six.text_type)
    assert isinstance(doc[2].dep, int)
    assert isinstance(doc[2].dep_, six.text_type)
    assert doc[2].tag_ == u'NNS'
|
||||
|
||||
|
||||
@pytest.mark.models('en')
def test_en_tagger_spaces(EN):
    """Whitespace tokens get POS ``SPACE`` (tag ``SP``); word tokens do not."""
    doc = EN("Some\nspaces are\tnecessary.", tag=True, parse=False)

    first_word = doc[0]
    assert first_word.pos != SPACE
    assert first_word.pos_ != 'SPACE'

    newline_token = doc[1]
    assert newline_token.pos == SPACE
    assert newline_token.pos_ == 'SPACE'
    assert newline_token.tag_ == 'SP'

    assert doc[2].pos != SPACE
    assert doc[3].pos != SPACE
    assert doc[4].pos == SPACE
|
||||
|
||||
|
||||
@pytest.mark.models('en')
def test_en_tagger_return_char(EN):
    """Whitespace tokens made of carriage returns/newlines get POS ``SPACE``."""
    message = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if '
               'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n')
    doc = EN(message)
    for tok in doc:
        if not tok.is_space:
            continue
        assert tok.pos == SPACE
    # The token after "hi Aaron," is the double CRLF.
    blank_line = doc[3]
    assert blank_line.text == '\r\n\r\n'
    assert blank_line.is_space
    assert blank_line.pos == SPACE
|
|
@ -7,7 +7,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
def test_tokenizer_handles_long_text(en_tokenizer):
|
||||
def test_en_tokenizer_handles_long_text(en_tokenizer):
|
||||
text = """Tributes pour in for late British Labour Party leader
|
||||
|
||||
Tributes poured in from around the world Thursday
|
||||
|
@ -30,7 +30,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
|||
("""'Me too!', Mr. P. Delaware cried. """, 11),
|
||||
("They ran about 10km.", 6),
|
||||
pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
|
||||
def test_tokenizer_handles_cnts(en_tokenizer, text, length):
|
||||
def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
|
||||
tokens = en_tokenizer(text)
|
||||
assert len(tokens) == length
|
||||
|
||||
|
|
|
@ -1,37 +1,33 @@
|
|||
# coding: utf-8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('fr')
|
||||
def test_lemmatizer_verb(FR):
|
||||
text = "Qu'est-ce que tu fais?"
|
||||
tokens = FR(text)
|
||||
tokens = FR("Qu'est-ce que tu fais?")
|
||||
assert tokens[0].lemma_ == "que"
|
||||
assert tokens[1].lemma_ == "être"
|
||||
assert tokens[5].lemma_ == "faire"
|
||||
|
||||
@pytest.mark.models
|
||||
|
||||
@pytest.mark.models('fr')
|
||||
@pytest.mark.xfail(reason="sont tagged as AUX")
|
||||
def test_lemmatizer_noun_verb_2(FR):
|
||||
text = "Les abaissements de température sont gênants."
|
||||
tokens = FR(text)
|
||||
tokens = FR("Les abaissements de température sont gênants.")
|
||||
assert tokens[4].lemma_ == "être"
|
||||
|
||||
@pytest.mark.models
|
||||
|
||||
@pytest.mark.models('fr')
@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
def test_lemmatizer_noun(FR):
    """The feminine demonym "Costaricienne" should lemmatize to "Costaricain".

    The parameter must be the ``FR`` fixture: the body calls ``FR(...)`` and
    this module defines no other ``FR`` name. Marked xfail for the reason
    given in the marker.
    """
    tokens = FR("il y a des Costaricienne.")
    assert tokens[4].lemma_ == "Costaricain"
|
||||
|
||||
@pytest.mark.models
|
||||
|
||||
@pytest.mark.models('fr')
|
||||
def test_lemmatizer_noun_2(FR):
|
||||
text = "Les abaissements de température sont gênants."
|
||||
tokens = FR(text)
|
||||
tokens = FR("Les abaissements de température sont gênants.")
|
||||
assert tokens[1].lemma_ == "abaissement"
|
||||
assert tokens[5].lemma_ == "gênant"
|
||||
|
||||
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...tokens import Doc
|
||||
from ..util import get_doc, apply_transition_sequence
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text', ["A test sentence"])
@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
def test_parser_sbd_single_punct(en_tokenizer, text, punct):
    """Check that a single sentence yields exactly one sentence span.

    The sentence is tried with each final punctuation mark and with none;
    in every case the doc must contain one sentence covering all tokens.
    """
    # One head entry more when a punctuation mark is appended to the text.
    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
    tokens = en_tokenizer(text + punct)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    # The parentheses matter: ``len(doc) == 4 if punct else 3`` parses as
    # ``(len(doc) == 4) if punct else 3``, which asserts the constant 3
    # (always true) whenever there is no punctuation.
    assert len(doc) == (4 if punct else 3)
    assert len(list(doc.sents)) == 1
    assert sum(len(sent) for sent in doc.sents) == len(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail
def test_parser_sentence_breaks(en_tokenizer, en_parser):
    """Apply a fixed transition sequence and check the resulting sentences.

    Builds a doc for two five-token clauses, runs the transitions through
    the parser, then checks the sentence count, that every non-space token
    has a dependency label, and the absolute head index of every token.
    """
    # Both clauses share the same head offsets and dependency labels.
    clause_heads = [1, 0, 1, -2, -3]
    clause_deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct']
    heads = clause_heads + clause_heads
    deps = clause_deps + clause_deps
    transition = [
        'L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
        'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct',
    ]

    tokens = en_tokenizer("This is a sentence . This is another one .")
    words = [t.text for t in tokens]
    doc = get_doc(tokens.vocab, words, heads=heads, deps=deps)
    apply_transition_sequence(en_parser, doc, transition)

    sentences = list(doc.sents)
    assert len(sentences) == 2
    for tok in doc:
        assert tok.dep != 0 or tok.is_space
    head_indices = [tok.head.i for tok in doc]
    assert head_indices == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
|
||||
|
||||
|
||||
# Currently, there's no way of setting the serializer data for the parser
|
||||
# without loading the models, so we can't remove the model dependency here yet.
|
||||
|
||||
@pytest.mark.xfail
@pytest.mark.models
def test_parser_sbd_serialization_projective(EN):
    """Test that sentence boundaries are the same before and after a
    round trip through ``to_bytes`` / ``from_bytes``."""
    transition = [
        'L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj',
        'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod',
        'R-acomp', 'D', 'R-punct',
    ]

    doc = EN.tokenizer("I bought a couch from IKEA It wasn't very comfortable.")
    apply_transition_sequence(EN.parser, doc, transition)

    # Round-trip the parsed doc through its byte representation.
    payload = doc.to_bytes()
    doc_serialized = Doc(EN.vocab).from_bytes(payload)

    assert doc.is_parsed == True
    assert doc_serialized.is_parsed == True
    assert doc.to_bytes() == doc_serialized.to_bytes()
    original_sents = [s.text for s in doc.sents]
    restored_sents = [s.text for s in doc_serialized.sents]
    assert original_sents == restored_sents
|
|
@ -1,17 +1,11 @@
|
|||
import pytest
|
||||
|
||||
from ...pipeline import NeuralDependencyParser
|
||||
from ...vocab import Vocab
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vocab():
|
||||
return Vocab()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def parser(vocab):
|
||||
parser = NeuralDependencyParser(vocab)
|
||||
def parser(en_vocab):
|
||||
parser = NeuralDependencyParser(en_vocab)
|
||||
parser.add_label('nsubj')
|
||||
parser.model, cfg = parser.Model(parser.moves.n_moves)
|
||||
parser.cfg.update(cfg)
|
||||
|
@ -19,8 +13,8 @@ def parser(vocab):
|
|||
|
||||
|
||||
@pytest.fixture
|
||||
def blank_parser(vocab):
|
||||
parser = NeuralDependencyParser(vocab)
|
||||
def blank_parser(en_vocab):
|
||||
parser = NeuralDependencyParser(en_vocab)
|
||||
return parser
|
||||
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
@pytest.mark.parametrize('text,i', [("Jane's got a new car", 1),
|
||||
("Jane thinks that's a nice car", 3)])
|
||||
def test_issue401(EN, text, i):
|
||||
|
|
|
@ -6,7 +6,7 @@ from ...matcher import Matcher
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
def test_issue429(EN):
|
||||
def merge_phrases(matcher, doc, i, matches):
|
||||
if i != len(matches) - 1:
|
||||
|
|
|
@ -6,7 +6,7 @@ from ..util import get_doc
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
def test_issue514(EN):
|
||||
"""Test serializing after adding entity"""
|
||||
text = ["This", "is", "a", "sentence", "about", "pasta", "."]
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
def test_issue54(EN):
|
||||
text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
|
||||
tokens = EN(text)
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
@pytest.mark.parametrize('text', ["He is the man", "he is the man"])
|
||||
def test_issue686(EN, text):
|
||||
"""Test that pronoun lemmas are assigned correctly."""
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
def test_issue693(EN):
|
||||
"""Test that doc.noun_chunks parses the complete sentence."""
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
def test_issue704(EN):
|
||||
"""Test that sentence boundaries are detected correctly."""
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
@pytest.mark.parametrize('text1,text2',
|
||||
[("You're happy", "You are happy"),
|
||||
("I'm happy", "I am happy"),
|
||||
|
|
|
@ -4,7 +4,7 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
@pytest.mark.parametrize('text', ["s..."])
|
||||
def test_issue719(EN, text):
|
||||
"""Test that the token 's' is not lemmatized into empty string."""
|
||||
|
|
|
@ -4,7 +4,7 @@ import pytest
|
|||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.models
|
||||
@pytest.mark.models('en')
|
||||
def test_issue758(EN):
|
||||
'''Test parser transition bug after label added.'''
|
||||
from ...matcher import merge_phrase
|
||||
|
|
|
@ -5,6 +5,8 @@ import pytest
|
|||
|
||||
|
||||
# Note: "chromosomes" already worked prior to the bug fix
|
||||
@pytest.mark.models('en')
|
||||
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
|
||||
def test_issue781(lemmatizer, word, lemmas):
|
||||
def test_issue781(EN, word, lemmas):
|
||||
lemmatizer = EN.Defaults.create_lemmatizer()
|
||||
assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)
|
||||
|
|
|
@ -70,8 +70,8 @@ def temp_save_model(model):
|
|||
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_issue910(train_data, additional_entity_types):
|
||||
@pytest.mark.models('en')
|
||||
def test_issue910(EN, train_data, additional_entity_types):
|
||||
'''Test that adding entities and resuming training works passably OK.
|
||||
There are two issues here:
|
||||
|
||||
|
@ -79,8 +79,7 @@ def test_issue910(train_data, additional_entity_types):
|
|||
2) There's no way to set the learning rate for the weight update, so we
|
||||
end up out-of-scale, causing it to learn too fast.
|
||||
'''
|
||||
nlp = English()
|
||||
doc = nlp(u"I am looking for a restaurant in Berlin")
|
||||
doc = EN(u"I am looking for a restaurant in Berlin")
|
||||
ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
|
||||
# Fine tune the ner model
|
||||
for entity_type in additional_entity_types:
|
||||
|
|
|
@ -1,18 +1,13 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from ... import load as load_spacy
|
||||
|
||||
@pytest.fixture
|
||||
def doc():
|
||||
nlp = load_spacy('en')
|
||||
return nlp('Does flight number three fifty-four require a connecting flight'
|
||||
' to get to Boston?')
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_issue955(doc):
|
||||
@pytest.mark.models('en')
|
||||
def test_issue955(EN, doc):
|
||||
'''Test that we don't have any nested noun chunks'''
|
||||
doc = EN('Does flight number three fifty-four require a connecting flight'
|
||||
' to get to Boston?')
|
||||
seen_tokens = set()
|
||||
for np in doc.noun_chunks:
|
||||
print(np.text, np.root.text, np.root.dep_, np.root.tag_)
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
@pytest.mark.parametrize('text,lemmas', [
    ("aardwolves", ["aardwolf"]),
    ("aardwolf", ["aardwolf"]),
    ("planets", ["planet"]),
    ("ring", ["ring"]),
    ("axes", ["axis", "axe", "ax"]),
])
def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
    """Check the set of noun lemmas returned for each parametrized word.

    Does nothing when the ``lemmatizer`` fixture is ``None``.
    """
    if lemmatizer is not None:
        expected = set(lemmas)
        assert lemmatizer.noun(text) == expected
|
||||
|
||||
|
||||
@pytest.mark.xfail
@pytest.mark.models
def test_tagger_lemmatizer_base_forms(lemmatizer):
    """Check noun lemmas of 'dive' under singular and plural morphology.

    Does nothing when the ``lemmatizer`` fixture is ``None``.
    """
    if lemmatizer is None:
        return None
    singular = lemmatizer.noun('dive', {'number': 'sing'})
    assert singular == set(['dive'])
    plural = lemmatizer.noun('dive', {'number': 'plur'})
    assert plural == set(['diva'])
|
||||
|
||||
|
||||
@pytest.mark.models
def test_tagger_lemmatizer_base_form_verb(lemmatizer):
    """Check that the past-form verb 'saw' is lemmatized to 'see'.

    Does nothing when the ``lemmatizer`` fixture is ``None``.
    """
    if lemmatizer is not None:
        verb_lemmas = lemmatizer.verb('saw', {'verbform': 'past'})
        assert verb_lemmas == set(['see'])
|
||||
|
||||
|
||||
@pytest.mark.models
def test_tagger_lemmatizer_punct(lemmatizer):
    """Check that curly double quotes are lemmatized to a straight quote.

    Does nothing when the ``lemmatizer`` fixture is ``None``.
    """
    if lemmatizer is None:
        return None
    assert lemmatizer.punct('“') == set(['"'])
    # The second assertion used to repeat the opening quote verbatim; check
    # the closing quote instead so both characters are covered.
    # NOTE(review): assumes the punctuation rules also map the closing
    # quote to '"' -- confirm against the lemmatizer rules.
    assert lemmatizer.punct('”') == set(['"'])
|
||||
|
||||
|
||||
@pytest.mark.models
def test_tagger_lemmatizer_lemma_assignment(EN):
    """Check that lemmas are empty after tokenizing and set after tagging."""
    doc = EN.tokenizer("Bananas in pyjamas are geese.")
    # Straight out of the tokenizer no token has a lemma yet.
    lemmas_before = [t.lemma_ for t in doc]
    assert all(lemma == '' for lemma in lemmas_before)
    EN.tagger(doc)
    # Running the tagger on the doc fills in a lemma for every token.
    lemmas_after = [t.lemma_ for t in doc]
    assert all(lemma != '' for lemma in lemmas_after)
|
|
@ -1,17 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..util import get_doc
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_tagger_load_morph_exc(en_tokenizer):
    """Check that a loaded morphology exception determines the lemma.

    Loads an exception for 'like' under the VBP tag into the tokenizer's
    vocab, builds a tagged doc and checks the tag and lemma of that token.
    """
    exceptions = {'VBP': {'like': {'L': 'luck'}}}
    en_tokenizer.vocab.morphology.load_morph_exceptions(exceptions)

    tokens = en_tokenizer("I like his style.")
    words = [t.text for t in tokens]
    doc = get_doc(tokens.vocab, words, tags=['PRP', 'VBP', 'PRP$', 'NN', '.'])

    like = doc[1]
    assert like.tag_ == 'VBP'
    assert like.lemma_ == 'luck'
|
|
@ -1,35 +0,0 @@
|
|||
# coding: utf-8
|
||||
"""Ensure spaces are assigned the POS tag SPACE"""
|
||||
|
||||
|
||||
from __future__ import unicode_literals
|
||||
from ...parts_of_speech import SPACE
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
def test_tagger_spaces(EN):
    """Check that whitespace tokens get the SPACE part-of-speech.

    Tokens 1 and 4 of the tagged doc must be SPACE (token 1 also carries
    the 'SP' tag); tokens 0, 2 and 3 must not be.
    """
    doc = EN("Some\nspaces are\tnecessary.", tag=True, parse=False)

    first = doc[0]
    assert first.pos != SPACE
    assert first.pos_ != 'SPACE'

    newline = doc[1]
    assert newline.pos == SPACE
    assert newline.pos_ == 'SPACE'
    assert newline.tag_ == 'SP'

    for index in (2, 3):
        assert doc[index].pos != SPACE
    assert doc[4].pos == SPACE
|
||||
|
||||
|
||||
@pytest.mark.models
def test_tagger_return_char(EN):
    """Check that tokens made of carriage returns / newlines are SPACE.

    Every whitespace token in the processed text must have the SPACE
    part-of-speech, and token 3 must be the '\\r\\n\\r\\n' run.
    """
    text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if '
            'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n')
    tokens = EN(text)

    whitespace_tokens = [tok for tok in tokens if tok.is_space]
    for tok in whitespace_tokens:
        assert tok.pos == SPACE

    line_break = tokens[3]
    assert line_break.text == '\r\n\r\n'
    assert line_break.is_space
    assert line_break.pos == SPACE
|
|
@ -1,16 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import six
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.models
def test_tag_names(EN):
    """Check the types of the integer and string tag attributes on a token.

    ``pos`` and ``dep`` must be ints, ``pos_`` and ``dep_`` must be text,
    and the fine-grained tag of token 2 must be 'NNS'.
    """
    doc = EN("I ate pizzas with anchovies.", parse=False, tag=True)
    token = doc[2]

    # Integer ID attributes.
    assert type(token.pos) == int
    assert isinstance(token.pos_, six.text_type)
    assert type(token.dep) == int
    assert isinstance(token.dep_, six.text_type)
    assert token.tag_ == u'NNS'
|
|
@ -20,6 +20,7 @@ def test_util_ensure_path_succeeds(text):
|
|||
assert isinstance(path, Path)
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_simple_model_roundtrip_bytes():
|
||||
model = Maxout(5, 10, pieces=2)
|
||||
model.b += 1
|
||||
|
@ -29,6 +30,7 @@ def test_simple_model_roundtrip_bytes():
|
|||
assert model.b[0, 0] == 1
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_multi_model_roundtrip_bytes():
|
||||
model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
|
||||
model._layers[0].b += 1
|
||||
|
@ -41,6 +43,7 @@ def test_multi_model_roundtrip_bytes():
|
|||
assert model._layers[1].b[0, 0] == 2
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_multi_model_load_missing_dims():
|
||||
model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
|
||||
model._layers[0].b += 1
|
||||
|
@ -52,6 +55,7 @@ def test_multi_model_load_missing_dims():
|
|||
assert model2._layers[0].b[0, 0] == 1
|
||||
assert model2._layers[1].b[0, 0] == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('package', ['thinc'])
|
||||
def test_util_is_package(package):
|
||||
"""Test that an installed package via pip is recognised by util.is_package."""
|
||||
|
|
Loading…
Reference in New Issue
Block a user