diff --git a/spacy/__init__.py b/spacy/__init__.py index 9a1f8304e..068282b1a 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,6 +7,7 @@ from .deprecated import resolve_load_name from .about import __version__ from . import util + def load(name, **overrides): name = resolve_load_name(name, **overrides) return util.load_model(name, **overrides) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6b577be62..b5a34cb2d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,19 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals -from ..tokens import Doc -from ..strings import StringStore -from ..lemmatizer import Lemmatizer -from ..attrs import ORTH, TAG, HEAD, DEP -from .. import util - from io import StringIO, BytesIO from pathlib import Path import pytest +from .util import load_test_model +from ..tokens import Doc +from ..strings import StringStore +from .. import util + _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb', - 'nl', 'pl', 'pt', 'sv'] + 'nl', 'pl', 'pt', 'sv', 'xx'] +_models = {'en': ['en_core_web_sm', 'en_core_web_md'], + 'de': ['de_core_news_md'], + 'fr': ['fr_depvec_web_lg'], + 'xx': ['xx_ent_web_md']} + + +# only used for tests that require loading the models +# in all other cases, use specific instances + +@pytest.fixture(params=_models['en'], scope="session") +def EN(request): + return load_test_model(request.param) + + +@pytest.fixture(params=_models['de'], scope="session") +def DE(request): + return load_test_model(request.param) + + +@pytest.fixture(params=_models['fr'], scope="session") +def FR(request): + return load_test_model(request.param) @pytest.fixture(params=_languages) @@ -91,11 +112,6 @@ def en_entityrecognizer(): return util.get_lang_class('en').Defaults.create_entity() -@pytest.fixture -def lemmatizer(): - return util.get_lang_class('en').Defaults.create_lemmatizer() - - @pytest.fixture def text_file(): return StringIO() @@ -105,22 +121,6 @@ def text_file_b(): return BytesIO() -# only used for tests that require loading the models -# in all other cases, use specific instances -@pytest.fixture(scope="session") -def EN(): - return English() - - -@pytest.fixture(scope="session") -def DE(): - return German() - -@pytest.fixture(scope="session") -def FR(): - return French() - - def pytest_addoption(parser): parser.addoption("--models", action="store_true", help="include tests that require full models") @@ -129,8 +129,18 @@ def pytest_addoption(parser): parser.addoption("--slow", action="store_true", help="include slow tests") + for lang in _languages + ['all']: + parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang) + def pytest_runtest_setup(item): for opt in ['models', 'vectors', 'slow']: if opt in item.keywords and not item.config.getoption("--%s" % opt): pytest.skip("need --%s option to run" % opt) + + # Check if test is marked with models and has arguments set, i.e. specific + # language. If so, skip test if flag not set. 
+ if item.get_marker('models'): + for arg in item.get_marker('models').args: + if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"): + pytest.skip() diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py new file mode 100644 index 000000000..6b5b25901 --- /dev/null +++ b/spacy/tests/lang/de/test_parser.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ...util import get_doc + +import pytest + + +def test_de_parser_noun_chunks_standard_de(de_tokenizer): + text = "Eine Tasse steht auf dem Tisch." + heads = [1, 1, 0, -1, 1, -2, -4] + tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.'] + deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct'] + + tokens = de_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 2 + assert chunks[0].text_with_ws == "Eine Tasse " + assert chunks[1].text_with_ws == "dem Tisch " + + +def test_de_extended_chunk(de_tokenizer): + text = "Die Sängerin singt mit einer Tasse Kaffee Arien." + heads = [1, 1, 0, -1, 1, -2, -1, -5, -6] + tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.'] + deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct'] + + tokens = de_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) + chunks = list(doc.noun_chunks) + assert len(chunks) == 3 + assert chunks[0].text_with_ws == "Die Sängerin " + assert chunks[1].text_with_ws == "einer Tasse Kaffee " + assert chunks[2].text_with_ws == "Arien " diff --git a/spacy/tests/lang/en/test_contractions.py b/spacy/tests/lang/en/test_contractions.py deleted file mode 100644 index a97b8f5ba..000000000 --- a/spacy/tests/lang/en/test_contractions.py +++ /dev/null @@ -1,87 +0,0 @@ -# coding: utf-8 -"""Test that tokens are created correctly for contractions.""" - - -from __future__ import unicode_literals - -import pytest - - -def test_tokenizer_handles_basic_contraction(en_tokenizer): - text = "don't giggle" - tokens = en_tokenizer(text) - assert len(tokens) == 3 - assert tokens[1].text == "n't" - text = "i said don't!" - tokens = en_tokenizer(text) - assert len(tokens) == 5 - assert tokens[4].text == "!" 
- - -@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) -def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 3 - - -@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) -def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): - tokens = en_tokenizer(text_poss) - assert len(tokens) == 2 - assert tokens[0].text == text - assert tokens[1].text == "'s" - - -@pytest.mark.parametrize('text', ["schools'", "Alexis'"]) -def test_tokenizer_splits_trailing_apos(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text == text.split("'")[0] - assert tokens[1].text == "'" - - -@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"]) -def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 1 - assert tokens[0].text == text - - -@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"]) -def test_tokenizer_handles_ll_contraction(en_tokenizer, text): - tokens = en_tokenizer(text) - assert len(tokens) == 2 - assert tokens[0].text == text.split("'")[0] - assert tokens[1].text == "'ll" - assert tokens[1].lemma_ == "will" - - -@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")]) -def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title): - tokens_lower = en_tokenizer(text_lower) - tokens_title = en_tokenizer(text_title) - assert tokens_title[0].text == tokens_lower[0].text.title() - assert tokens_lower[0].text == tokens_title[0].text.lower() - assert tokens_lower[1].text == tokens_title[1].text - - -@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) -@pytest.mark.parametrize('contraction', ["'ll", "'d"]) -def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction): - tokens = en_tokenizer(pron + contraction) - assert tokens[0].text == pron - assert tokens[1].text == contraction - - -@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) -def test_tokenizer_excludes_ambiguous(en_tokenizer, exc): - tokens = en_tokenizer(exc) - assert len(tokens) == 1 - - -@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")]) -def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): - tokens = en_tokenizer(wo_punct) - assert len(tokens) == 2 - tokens = en_tokenizer(w_punct) - assert len(tokens) == 3 diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 03e738a34..a49c0c421 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -1,19 +1,96 @@ # coding: utf-8 -"""Test that tokenizer exceptions are handled correctly.""" - - from __future__ import unicode_literals import pytest +def test_en_tokenizer_handles_basic_contraction(en_tokenizer): + text = "don't giggle" + tokens = en_tokenizer(text) + assert len(tokens) == 3 + assert tokens[1].text == "n't" + text = "i said don't!" + tokens = en_tokenizer(text) + assert len(tokens) == 5 + assert tokens[4].text == "!" 
+ + +@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) +def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 3 + + +@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) +def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): + tokens = en_tokenizer(text_poss) + assert len(tokens) == 2 + assert tokens[0].text == text + assert tokens[1].text == "'s" + + +@pytest.mark.parametrize('text', ["schools'", "Alexis'"]) +def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'" + + +@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"]) +def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].text == text + + +@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"]) +def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): + tokens = en_tokenizer(text) + assert len(tokens) == 2 + assert tokens[0].text == text.split("'")[0] + assert tokens[1].text == "'ll" + assert tokens[1].lemma_ == "will" + + +@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")]) +def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title): + tokens_lower = en_tokenizer(text_lower) + tokens_title = en_tokenizer(text_title) + assert tokens_title[0].text == tokens_lower[0].text.title() + assert tokens_lower[0].text == tokens_title[0].text.lower() + assert tokens_lower[1].text == tokens_title[1].text + + +@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) +@pytest.mark.parametrize('contraction', ["'ll", "'d"]) +def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction): + tokens = en_tokenizer(pron + contraction) + assert tokens[0].text == pron + assert tokens[1].text == contraction + + +@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) +def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc): + tokens = en_tokenizer(exc) + assert len(tokens) == 1 + + +@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")]) +def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): + tokens = en_tokenizer(wo_punct) + assert len(tokens) == 2 + tokens = en_tokenizer(w_punct) + assert len(tokens) == 3 + + @pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."]) -def test_tokenizer_handles_abbr(en_tokenizer, text): +def test_en_tokenizer_handles_abbr(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 1 -def test_tokenizer_handles_exc_in_text(en_tokenizer): +def test_en_tokenizer_handles_exc_in_text(en_tokenizer): text = "It's mediocre i.e. bad." 
tokens = en_tokenizer(text) assert len(tokens) == 6 @@ -21,7 +98,7 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer): @pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"]) -def test_tokenizer_handles_times(en_tokenizer, text): +def test_en_tokenizer_handles_times(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 assert tokens[1].lemma_ in ["a.m.", "p.m."] diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py index 0ed6ca4dc..c8f4c4b61 100644 --- a/spacy/tests/lang/en/test_indices.py +++ b/spacy/tests/lang/en/test_indices.py @@ -7,7 +7,7 @@ from __future__ import unicode_literals import pytest -def test_simple_punct(en_tokenizer): +def test_en_simple_punct(en_tokenizer): text = "to walk, do foo" tokens = en_tokenizer(text) assert tokens[0].idx == 0 @@ -17,7 +17,7 @@ def test_simple_punct(en_tokenizer): assert tokens[4].idx == 12 -def test_complex_punct(en_tokenizer): +def test_en_complex_punct(en_tokenizer): text = "Tom (D., Ill.)!" tokens = en_tokenizer(text) assert tokens[0].idx == 0 diff --git a/spacy/tests/lang/en/test_lemmatizer.py b/spacy/tests/lang/en/test_lemmatizer.py new file mode 100644 index 000000000..ec69f6a6d --- /dev/null +++ b/spacy/tests/lang/en/test_lemmatizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.fixture +def en_lemmatizer(EN): + return EN.Defaults.create_lemmatizer() + + +@pytest.mark.models('en') +@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]), + ("aardwolf", ["aardwolf"]), + ("planets", ["planet"]), + ("ring", ["ring"]), + ("axes", ["axis", "axe", "ax"])]) +def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas): + assert en_lemmatizer.noun(text) == set(lemmas) + + +@pytest.mark.xfail +@pytest.mark.models('en') +def test_en_lemmatizer_base_forms(en_lemmatizer): + assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive']) + assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva']) + + +@pytest.mark.models('en') +def test_en_lemmatizer_base_form_verb(en_lemmatizer): + assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see']) + + +@pytest.mark.models('en') +def test_en_lemmatizer_punct(en_lemmatizer): + assert en_lemmatizer.punct('“') == set(['"']) + assert en_lemmatizer.punct('“') == set(['"']) + + +@pytest.mark.models('en') +def test_en_lemmatizer_lemma_assignment(EN): + text = "Bananas in pyjamas are geese." + doc = EN.tokenizer(text) + assert all(t.lemma_ == '' for t in doc) + EN.tagger(doc) + assert all(t.lemma_ != '' for t in doc) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/lang/en/test_ner.py similarity index 88% rename from spacy/tests/parser/test_ner.py rename to spacy/tests/lang/en/test_ner.py index 38a0900c4..34fbbc898 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/lang/en/test_ner.py @@ -5,8 +5,8 @@ from spacy.attrs import LOWER from spacy.matcher import Matcher -@pytest.mark.models -def test_simple_types(EN): +@pytest.mark.models('en') +def test_en_ner_simple_types(EN): tokens = EN(u'Mr. 
Best flew to New York on Saturday morning.') ents = list(tokens.ents) assert ents[0].start == 1 @@ -17,8 +17,8 @@ def test_simple_types(EN): assert ents[1].label_ == 'GPE' -@pytest.mark.models -def test_consistency_bug(EN): +@pytest.mark.models('en') +def test_en_ner_consistency_bug(EN): '''Test an arbitrary sequence-consistency bug encountered during speed test''' tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.') tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False) @@ -26,8 +26,8 @@ def test_consistency_bug(EN): EN.entity(tokens) -@pytest.mark.models -def test_unit_end_gazetteer(EN): +@pytest.mark.models('en') +def test_en_ner_unit_end_gazetteer(EN): '''Test a bug in the interaction between the NER model and the gazetteer''' matcher = Matcher(EN.vocab) matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}]) @@ -38,6 +38,3 @@ def test_unit_end_gazetteer(EN): doc.ents += tuple(ents) EN.entity(doc) assert list(doc.ents)[0].text == 'cal' - - - diff --git a/spacy/tests/parser/test_noun_chunks.py b/spacy/tests/lang/en/test_parser.py similarity index 59% rename from spacy/tests/parser/test_noun_chunks.py rename to spacy/tests/lang/en/test_parser.py index 5e8c7659a..39d0fce61 100644 --- a/spacy/tests/parser/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_parser.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from ..util import get_doc +from ...util import get_doc import pytest @@ -45,32 +45,3 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer): assert len(chunks) == 2 assert chunks[0].text_with_ws == "A phrase " assert chunks[1].text_with_ws == "another phrase " - - -def test_parser_noun_chunks_standard_de(de_tokenizer): - text = "Eine Tasse steht auf dem Tisch." - heads = [1, 1, 0, -1, 1, -2, -4] - tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.'] - deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct'] - - tokens = de_tokenizer(text) - doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) - chunks = list(doc.noun_chunks) - assert len(chunks) == 2 - assert chunks[0].text_with_ws == "Eine Tasse " - assert chunks[1].text_with_ws == "dem Tisch " - - -def test_de_extended_chunk(de_tokenizer): - text = "Die Sängerin singt mit einer Tasse Kaffee Arien." 
- heads = [1, 1, 0, -1, 1, -2, -1, -5, -6] - tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.'] - deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct'] - - tokens = de_tokenizer(text) - doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads) - chunks = list(doc.noun_chunks) - assert len(chunks) == 3 - assert chunks[0].text_with_ws == "Die Sängerin " - assert chunks[1].text_with_ws == "einer Tasse Kaffee " - assert chunks[2].text_with_ws == "Arien " diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py index d7d5592f4..750008603 100644 --- a/spacy/tests/lang/en/test_punct.py +++ b/spacy/tests/lang/en/test_punct.py @@ -16,14 +16,14 @@ PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] @pytest.mark.parametrize('text', ["(", "((", "<"]) -def test_tokenizer_handles_only_punct(en_tokenizer, text): +def test_en_tokenizer_handles_only_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == len(text) @pytest.mark.parametrize('punct', PUNCT_OPEN) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_open_punct(en_tokenizer, punct, text): +def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text): tokens = en_tokenizer(punct + text) assert len(tokens) == 2 assert tokens[0].text == punct @@ -32,7 +32,7 @@ def test_tokenizer_splits_open_punct(en_tokenizer, punct, text): @pytest.mark.parametrize('punct', PUNCT_CLOSE) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_close_punct(en_tokenizer, punct, text): +def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text): tokens = en_tokenizer(text + punct) assert len(tokens) == 2 assert tokens[0].text == text @@ -42,7 +42,7 @@ def test_tokenizer_splits_close_punct(en_tokenizer, punct, text): @pytest.mark.parametrize('punct', PUNCT_OPEN) @pytest.mark.parametrize('punct_add', ["`"]) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text): +def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text): tokens = en_tokenizer(punct + punct_add + text) assert len(tokens) == 3 assert tokens[0].text == punct @@ -53,7 +53,7 @@ def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, te @pytest.mark.parametrize('punct', PUNCT_CLOSE) @pytest.mark.parametrize('punct_add', ["'"]) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text): +def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text): tokens = en_tokenizer(text + punct + punct_add) assert len(tokens) == 3 assert tokens[0].text == text @@ -63,7 +63,7 @@ def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, t @pytest.mark.parametrize('punct', PUNCT_OPEN) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): +def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): tokens = en_tokenizer(punct + punct + punct + text) assert len(tokens) == 4 assert tokens[0].text == punct @@ -72,7 +72,7 @@ def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): @pytest.mark.parametrize('punct', PUNCT_CLOSE) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): +def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): tokens = en_tokenizer(text 
+ punct + punct + punct) assert len(tokens) == 4 assert tokens[0].text == text @@ -80,14 +80,14 @@ def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): @pytest.mark.parametrize('text', ["'The"]) -def test_tokenizer_splits_open_appostrophe(en_tokenizer, text): +def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 assert tokens[0].text == "'" @pytest.mark.parametrize('text', ["Hello''"]) -def test_tokenizer_splits_double_end_quote(en_tokenizer, text): +def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 tokens_punct = en_tokenizer("''") @@ -96,7 +96,7 @@ @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, +def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, punct_close, text): tokens = en_tokenizer(punct_open + text + punct_close) assert len(tokens) == 3 @@ -108,7 +108,7 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) @pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) @pytest.mark.parametrize('text', ["Hello"]) -def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close, +def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text): tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) assert len(tokens) == 5 @@ -120,13 +120,13 @@ def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close, @pytest.mark.parametrize('text,punct', [("(can't", "(")]) -def test_tokenizer_splits_pre_punct_regex(text, punct): +def test_en_tokenizer_splits_pre_punct_regex(text, punct): en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search match = en_search_prefixes(text) assert match.group() == punct -def test_tokenizer_splits_bracket_period(en_tokenizer): +def test_en_tokenizer_splits_bracket_period(en_tokenizer): text = "(And a 6a.m. run through Washington Park)." tokens = en_tokenizer(text) assert tokens[len(tokens) - 1].text == "." diff --git a/spacy/tests/parser/test_sbd_prag.py b/spacy/tests/lang/en/test_sbd.py similarity index 75% rename from spacy/tests/parser/test_sbd_prag.py rename to spacy/tests/lang/en/test_sbd.py index ba5571224..2278f657e 100644 --- a/spacy/tests/parser/test_sbd_prag.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -1,9 +1,65 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +from ....tokens import Doc +from ...util import get_doc, apply_transition_sequence + import pytest +@pytest.mark.parametrize('text', ["A test sentence"]) +@pytest.mark.parametrize('punct', ['.', '!', '?', '']) +def test_en_sbd_single_punct(en_tokenizer, text, punct): + heads = [2, 1, 0, -1] if punct else [2, 1, 0] + tokens = en_tokenizer(text + punct) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) + assert len(doc) == (4 if punct else 3) + assert len(list(doc.sents)) == 1 + assert sum(len(sent) for sent in doc.sents) == len(doc) + + +@pytest.mark.xfail +def test_en_sentence_breaks(en_tokenizer, en_parser): + text = "This is a sentence . This is another one ." 
+ heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3] + deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det', + 'attr', 'punct'] + transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT', + 'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct'] + + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) + apply_transition_sequence(en_parser, doc, transition) + + assert len(list(doc.sents)) == 2 + for token in doc: + assert token.dep != 0 or token.is_space + assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6] + + +# Currently, there's no way of setting the serializer data for the parser +# without loading the models, so we can't remove the model dependency here yet. + +@pytest.mark.xfail +@pytest.mark.models('en') +def test_en_sbd_serialization_projective(EN): + """Test that before and after serialization, the sentence boundaries are + the same.""" + + text = "I bought a couch from IKEA It wasn't very comfortable." + transition = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj', + 'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', + 'R-acomp', 'D', 'R-punct'] + + doc = EN.tokenizer(text) + apply_transition_sequence(EN.parser, doc, transition) + doc_serialized = Doc(EN.vocab).from_bytes(doc.to_bytes()) + assert doc.is_parsed == True + assert doc_serialized.is_parsed == True + assert doc.to_bytes() == doc_serialized.to_bytes() + assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents] + + TEST_CASES = [ ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]), ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]), @@ -59,10 +115,9 @@ TEST_CASES = [ pytest.mark.xfail(("Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."])) ] -@pytest.mark.slow -@pytest.mark.models +@pytest.mark.models('en') @pytest.mark.parametrize('text,expected_sents', TEST_CASES) -def test_parser_sbd_prag(EN, text, expected_sents): +def test_en_sbd_prag(EN, text, expected_sents): """SBD tests from Pragmatic Segmenter""" doc = EN(text) sents = [] diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py new file mode 100644 index 000000000..859c40b39 --- /dev/null +++ b/spacy/tests/lang/en/test_tagger.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ....parts_of_speech import SPACE +from ...util import get_doc + +import six +import pytest + + +def test_en_tagger_load_morph_exc(en_tokenizer): + text = "I like his style." + tags = ['PRP', 'VBP', 'PRP$', 'NN', '.'] + morph_exc = {'VBP': {'like': {'L': 'luck'}}} + en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc) + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags) + assert doc[1].tag_ == 'VBP' + assert doc[1].lemma_ == 'luck' + + +@pytest.mark.models('en') +def test_tag_names(EN): + text = "I ate pizzas with anchovies." + doc = EN(text, parse=False, tag=True) + assert type(doc[2].pos) == int + assert isinstance(doc[2].pos_, six.text_type) + assert type(doc[2].dep) == int + assert isinstance(doc[2].dep_, six.text_type) + assert doc[2].tag_ == u'NNS' + + +@pytest.mark.models('en') +def test_en_tagger_spaces(EN): + """Ensure spaces are assigned the POS tag SPACE""" + text = "Some\nspaces are\tnecessary." 
+ doc = EN(text, tag=True, parse=False) + assert doc[0].pos != SPACE + assert doc[0].pos_ != 'SPACE' + assert doc[1].pos == SPACE + assert doc[1].pos_ == 'SPACE' + assert doc[1].tag_ == 'SP' + assert doc[2].pos != SPACE + assert doc[3].pos != SPACE + assert doc[4].pos == SPACE + + +@pytest.mark.models('en') +def test_en_tagger_return_char(EN): + """Ensure spaces are assigned the POS tag SPACE""" + text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if ' + 'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n') + tokens = EN(text) + for token in tokens: + if token.is_space: + assert token.pos == SPACE + assert tokens[3].text == '\r\n\r\n' + assert tokens[3].is_space + assert tokens[3].pos == SPACE diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index 2061a47e3..a2ffaf7ea 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -7,7 +7,7 @@ from __future__ import unicode_literals import pytest -def test_tokenizer_handles_long_text(en_tokenizer): +def test_en_tokenizer_handles_long_text(en_tokenizer): text = """Tributes pour in for late British Labour Party leader Tributes poured in from around the world Thursday @@ -30,7 +30,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian. ("""'Me too!', Mr. P. Delaware cried. """, 11), ("They ran about 10km.", 6), pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))]) -def test_tokenizer_handles_cnts(en_tokenizer, text, length): +def test_en_tokenizer_handles_cnts(en_tokenizer, text, length): tokens = en_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/fr/test_lemmatization.py b/spacy/tests/lang/fr/test_lemmatization.py index c009e72c0..bcd8d4600 100644 --- a/spacy/tests/lang/fr/test_lemmatization.py +++ b/spacy/tests/lang/fr/test_lemmatization.py @@ -1,37 +1,33 @@ # coding: utf-8 - from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('fr') def test_lemmatizer_verb(FR): - text = "Qu'est-ce que tu fais?" - tokens = FR(text) + tokens = FR("Qu'est-ce que tu fais?") assert tokens[0].lemma_ == "que" assert tokens[1].lemma_ == "être" assert tokens[5].lemma_ == "faire" -@pytest.mark.models + +@pytest.mark.models('fr') @pytest.mark.xfail(reason="sont tagged as AUX") def test_lemmatizer_noun_verb_2(FR): - text = "Les abaissements de température sont gênants." - tokens = FR(text) + tokens = FR("Les abaissements de température sont gênants.") assert tokens[4].lemma_ == "être" -@pytest.mark.models + +@pytest.mark.models('fr') @pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN") -def test_lemmatizer_noun(FR): - text = "il y a des Costaricienne." - tokens = FR(text) +def test_lemmatizer_noun(FR): + tokens = FR("il y a des Costaricienne.") assert tokens[4].lemma_ == "Costaricain" -@pytest.mark.models + +@pytest.mark.models('fr') def test_lemmatizer_noun_2(FR): - text = "Les abaissements de température sont gênants." 
- tokens = FR(text) + tokens = FR("Les abaissements de température sont gênants.") assert tokens[1].lemma_ == "abaissement" assert tokens[5].lemma_ == "gênant" - - diff --git a/spacy/tests/parser/test_sbd.py b/spacy/tests/parser/test_sbd.py deleted file mode 100644 index 4fa20c900..000000000 --- a/spacy/tests/parser/test_sbd.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from ...tokens import Doc -from ..util import get_doc, apply_transition_sequence - -import pytest - - -@pytest.mark.parametrize('text', ["A test sentence"]) -@pytest.mark.parametrize('punct', ['.', '!', '?', '']) -def test_parser_sbd_single_punct(en_tokenizer, text, punct): - heads = [2, 1, 0, -1] if punct else [2, 1, 0] - tokens = en_tokenizer(text + punct) - doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads) - assert len(doc) == 4 if punct else 3 - assert len(list(doc.sents)) == 1 - assert sum(len(sent) for sent in doc.sents) == len(doc) - - -@pytest.mark.xfail -def test_parser_sentence_breaks(en_tokenizer, en_parser): - text = "This is a sentence . This is another one ." - heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3] - deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det', - 'attr', 'punct'] - transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT', - 'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct'] - - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps) - apply_transition_sequence(en_parser, doc, transition) - - assert len(list(doc.sents)) == 2 - for token in doc: - assert token.dep != 0 or token.is_space - assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6] - - -# Currently, there's no way of setting the serializer data for the parser -# without loading the models, so we can't remove the model dependency here yet. - -@pytest.mark.xfail -@pytest.mark.models -def test_parser_sbd_serialization_projective(EN): - """Test that before and after serialization, the sentence boundaries are - the same.""" - - text = "I bought a couch from IKEA It wasn't very comfortable." 
- transition = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj', - 'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', - 'R-acomp', 'D', 'R-punct'] - - doc = EN.tokenizer(text) - apply_transition_sequence(EN.parser, doc, transition) - doc_serialized = Doc(EN.vocab).from_bytes(doc.to_bytes()) - assert doc.is_parsed == True - assert doc_serialized.is_parsed == True - assert doc.to_bytes() == doc_serialized.to_bytes() - assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents] diff --git a/spacy/tests/parser/test_to_from_bytes_disk.py b/spacy/tests/parser/test_to_from_bytes_disk.py index be536d679..b0a10fa8e 100644 --- a/spacy/tests/parser/test_to_from_bytes_disk.py +++ b/spacy/tests/parser/test_to_from_bytes_disk.py @@ -1,17 +1,11 @@ import pytest from ...pipeline import NeuralDependencyParser -from ...vocab import Vocab @pytest.fixture -def vocab(): - return Vocab() - - -@pytest.fixture -def parser(vocab): - parser = NeuralDependencyParser(vocab) +def parser(en_vocab): + parser = NeuralDependencyParser(en_vocab) parser.add_label('nsubj') parser.model, cfg = parser.Model(parser.moves.n_moves) parser.cfg.update(cfg) @@ -19,8 +13,8 @@ def parser(vocab): @pytest.fixture -def blank_parser(vocab): - parser = NeuralDependencyParser(vocab) +def blank_parser(en_vocab): + parser = NeuralDependencyParser(en_vocab) return parser diff --git a/spacy/tests/regression/test_issue401.py b/spacy/tests/regression/test_issue401.py index 9d862cc65..e5b72d472 100644 --- a/spacy/tests/regression/test_issue401.py +++ b/spacy/tests/regression/test_issue401.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') @pytest.mark.parametrize('text,i', [("Jane's got a new car", 1), ("Jane thinks that's a nice car", 3)]) def test_issue401(EN, text, i): diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py index 53d4dfc4d..df8d6d3fc 100644 --- a/spacy/tests/regression/test_issue429.py +++ b/spacy/tests/regression/test_issue429.py @@ -6,7 +6,7 @@ from ...matcher import Matcher import pytest -@pytest.mark.models +@pytest.mark.models('en') def test_issue429(EN): def merge_phrases(matcher, doc, i, matches): if i != len(matches) - 1: diff --git a/spacy/tests/regression/test_issue514.py b/spacy/tests/regression/test_issue514.py index a21b7333e..c03fab60b 100644 --- a/spacy/tests/regression/test_issue514.py +++ b/spacy/tests/regression/test_issue514.py @@ -6,7 +6,7 @@ from ..util import get_doc import pytest -@pytest.mark.models +@pytest.mark.models('en') def test_issue514(EN): """Test serializing after adding entity""" text = ["This", "is", "a", "sentence", "about", "pasta", "."] diff --git a/spacy/tests/regression/test_issue54.py b/spacy/tests/regression/test_issue54.py index 9085457f6..9867a4989 100644 --- a/spacy/tests/regression/test_issue54.py +++ b/spacy/tests/regression/test_issue54.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') def test_issue54(EN): text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)." 
tokens = EN(text) diff --git a/spacy/tests/regression/test_issue686.py b/spacy/tests/regression/test_issue686.py index d3807808a..1323393db 100644 --- a/spacy/tests/regression/test_issue686.py +++ b/spacy/tests/regression/test_issue686.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') @pytest.mark.parametrize('text', ["He is the man", "he is the man"]) def test_issue686(EN, text): """Test that pronoun lemmas are assigned correctly.""" diff --git a/spacy/tests/regression/test_issue693.py b/spacy/tests/regression/test_issue693.py index e4d907716..0cee46b9b 100644 --- a/spacy/tests/regression/test_issue693.py +++ b/spacy/tests/regression/test_issue693.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') def test_issue693(EN): """Test that doc.noun_chunks parses the complete sentence.""" diff --git a/spacy/tests/regression/test_issue704.py b/spacy/tests/regression/test_issue704.py index 2cecf6219..51abead86 100644 --- a/spacy/tests/regression/test_issue704.py +++ b/spacy/tests/regression/test_issue704.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') def test_issue704(EN): """Test that sentence boundaries are detected correctly.""" diff --git a/spacy/tests/regression/test_issue717.py b/spacy/tests/regression/test_issue717.py index 1548c06aa..69c0705cb 100644 --- a/spacy/tests/regression/test_issue717.py +++ b/spacy/tests/regression/test_issue717.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') @pytest.mark.parametrize('text1,text2', [("You're happy", "You are happy"), ("I'm happy", "I am happy"), diff --git a/spacy/tests/regression/test_issue719.py b/spacy/tests/regression/test_issue719.py index 62adbcd44..9b4838bdb 100644 --- a/spacy/tests/regression/test_issue719.py +++ b/spacy/tests/regression/test_issue719.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models +@pytest.mark.models('en') @pytest.mark.parametrize('text', ["s..."]) def test_issue719(EN, text): """Test that the token 's' is not lemmatized into empty string.""" diff --git a/spacy/tests/regression/test_issue758.py b/spacy/tests/regression/test_issue758.py index 0add70e2c..48e27be02 100644 --- a/spacy/tests/regression/test_issue758.py +++ b/spacy/tests/regression/test_issue758.py @@ -4,7 +4,7 @@ import pytest @pytest.mark.xfail -@pytest.mark.models +@pytest.mark.models('en') def test_issue758(EN): '''Test parser transition bug after label added.''' from ...matcher import merge_phrase diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py index 1c48d1534..e3f391a37 100644 --- a/spacy/tests/regression/test_issue781.py +++ b/spacy/tests/regression/test_issue781.py @@ -5,6 +5,8 @@ import pytest # Note: "chromosomes" worked previous the bug fix +@pytest.mark.models('en') @pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])]) -def test_issue781(lemmatizer, word, lemmas): +def test_issue781(EN, word, lemmas): + lemmatizer = EN.Defaults.create_lemmatizer() assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas) diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py index 4505b500e..cc6610e0d 100644 --- 
a/spacy/tests/regression/test_issue910.py +++ b/spacy/tests/regression/test_issue910.py @@ -70,8 +70,8 @@ def temp_save_model(model): -@pytest.mark.models -def test_issue910(train_data, additional_entity_types): +@pytest.mark.models('en') +def test_issue910(EN, train_data, additional_entity_types): '''Test that adding entities and resuming training works passably OK. There are two issues here: @@ -79,8 +79,7 @@ 2) There's no way to set the learning rate for the weight update, so we end up out-of-scale, causing it to learn too fast. ''' - nlp = English() - doc = nlp(u"I am looking for a restaurant in Berlin") + doc = EN(u"I am looking for a restaurant in Berlin") ents_before_train = [(ent.label_, ent.text) for ent in doc.ents] # Fine tune the ner model for entity_type in additional_entity_types: diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py index 633e96fb5..13a71336c 100644 --- a/spacy/tests/regression/test_issue995.py +++ b/spacy/tests/regression/test_issue995.py @@ -1,18 +1,13 @@ from __future__ import unicode_literals import pytest -from ... import load as load_spacy - -@pytest.fixture -def doc(): - nlp = load_spacy('en') - return nlp('Does flight number three fifty-four require a connecting flight' - ' to get to Boston?') -@pytest.mark.models -def test_issue955(doc): +@pytest.mark.models('en') +def test_issue955(EN): '''Test that we don't have any nested noun chunks''' + doc = EN('Does flight number three fifty-four require a connecting flight' + ' to get to Boston?') seen_tokens = set() for np in doc.noun_chunks: print(np.text, np.root.text, np.root.dep_, np.root.tag_) diff --git a/spacy/tests/tagger/__init__.py b/spacy/tests/tagger/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py deleted file mode 100644 index 5db0d0b2c..000000000 --- a/spacy/tests/tagger/test_lemmatizer.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest - - -@pytest.mark.models -@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]), - ("aardwolf", ["aardwolf"]), - ("planets", ["planet"]), - ("ring", ["ring"]), - ("axes", ["axis", "axe", "ax"])]) -def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas): - if lemmatizer is None: - return None - assert lemmatizer.noun(text) == set(lemmas) - - -@pytest.mark.xfail -@pytest.mark.models -def test_tagger_lemmatizer_base_forms(lemmatizer): - if lemmatizer is None: - return None - assert lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive']) - assert lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva']) - - -@pytest.mark.models -def test_tagger_lemmatizer_base_form_verb(lemmatizer): - if lemmatizer is None: - return None - assert lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see']) - - -@pytest.mark.models -def test_tagger_lemmatizer_punct(lemmatizer): - if lemmatizer is None: - return None - assert lemmatizer.punct('“') == set(['"']) - assert lemmatizer.punct('“') == set(['"']) - - -@pytest.mark.models -def test_tagger_lemmatizer_lemma_assignment(EN): - text = "Bananas in pyjamas are geese." 
- doc = EN.tokenizer(text) - assert all(t.lemma_ == '' for t in doc) - EN.tagger(doc) - assert all(t.lemma_ != '' for t in doc) diff --git a/spacy/tests/tagger/test_morph_exceptions.py b/spacy/tests/tagger/test_morph_exceptions.py deleted file mode 100644 index 63b0a9c15..000000000 --- a/spacy/tests/tagger/test_morph_exceptions.py +++ /dev/null @@ -1,17 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from ..util import get_doc - -import pytest - - -def test_tagger_load_morph_exc(en_tokenizer): - text = "I like his style." - tags = ['PRP', 'VBP', 'PRP$', 'NN', '.'] - morph_exc = {'VBP': {'like': {'L': 'luck'}}} - en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc) - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags) - assert doc[1].tag_ == 'VBP' - assert doc[1].lemma_ == 'luck' diff --git a/spacy/tests/tagger/test_spaces.py b/spacy/tests/tagger/test_spaces.py deleted file mode 100644 index 5b12eba7f..000000000 --- a/spacy/tests/tagger/test_spaces.py +++ /dev/null @@ -1,35 +0,0 @@ -# coding: utf-8 -"""Ensure spaces are assigned the POS tag SPACE""" - - -from __future__ import unicode_literals -from ...parts_of_speech import SPACE - -import pytest - - -@pytest.mark.models -def test_tagger_spaces(EN): - text = "Some\nspaces are\tnecessary." - doc = EN(text, tag=True, parse=False) - assert doc[0].pos != SPACE - assert doc[0].pos_ != 'SPACE' - assert doc[1].pos == SPACE - assert doc[1].pos_ == 'SPACE' - assert doc[1].tag_ == 'SP' - assert doc[2].pos != SPACE - assert doc[3].pos != SPACE - assert doc[4].pos == SPACE - - -@pytest.mark.models -def test_tagger_return_char(EN): - text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if ' - 'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n') - tokens = EN(text) - for token in tokens: - if token.is_space: - assert token.pos == SPACE - assert tokens[3].text == '\r\n\r\n' - assert tokens[3].is_space - assert tokens[3].pos == SPACE diff --git a/spacy/tests/tagger/test_tag_names.py b/spacy/tests/tagger/test_tag_names.py deleted file mode 100644 index 9c5b0adcc..000000000 --- a/spacy/tests/tagger/test_tag_names.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import six -import pytest - - -@pytest.mark.models -def test_tag_names(EN): - text = "I ate pizzas with anchovies." 
- doc = EN(text, parse=False, tag=True) - assert type(doc[2].pos) == int - assert isinstance(doc[2].pos_, six.text_type) - assert type(doc[2].dep) == int - assert isinstance(doc[2].dep_, six.text_type) - assert doc[2].tag_ == u'NNS' diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 2c1b2cefa..00ee1a93a 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -20,6 +20,7 @@ def test_util_ensure_path_succeeds(text): assert isinstance(path, Path) +@pytest.mark.models def test_simple_model_roundtrip_bytes(): model = Maxout(5, 10, pieces=2) model.b += 1 @@ -29,6 +30,7 @@ def test_simple_model_roundtrip_bytes(): assert model.b[0, 0] == 1 +@pytest.mark.models def test_multi_model_roundtrip_bytes(): model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3)) model._layers[0].b += 1 @@ -41,6 +43,7 @@ def test_multi_model_roundtrip_bytes(): assert model._layers[1].b[0, 0] == 2 +@pytest.mark.models def test_multi_model_load_missing_dims(): model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3)) model._layers[0].b += 1 diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 9f7300c7e..385ff414b 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -4,9 +4,20 @@ from __future__ import unicode_literals from ..tokens import Doc from ..attrs import ORTH, POS, HEAD, DEP +import pytest import numpy +MODELS = {} + + +def load_test_model(model): + if model not in MODELS: + module = pytest.importorskip(model) + MODELS[model] = module.load() + return MODELS[model] + + def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): """Create Doc object from given vocab, words and annotations.""" pos = pos or [''] * len(words) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index f3ce0ad83..052942672 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -2,9 +2,16 @@ include ../../_includes/_mixins ++aside("Help us improve the docs") + | Did you spot a mistake or come across explanations that + | are unclear? We always appreciate improvement + | #[+a(gh("spaCy") + "/issues") suggestions] or + | #[+a(gh("spaCy") + "/pulls") pull requests]. You can find a "Suggest + | edits" link at the bottom of each page that points you to the source. + +h(2, "whats-spacy") What's spaCy? -+grid ++grid.o-no-block +grid-col("half") +grid-col("half") @@ -52,8 +59,8 @@ p +row +cell #[strong Dependency Parsing] +cell - | Assigning syntactic dependency labels, i.e. the relations between - | individual tokens. + | Assigning syntactic dependency labels, describing the relations + | between individual tokens, like subject or object. +cell #[+procon("pro")] +row @@ -374,6 +381,8 @@ p | on GitHub, which we use to tag bugs and feature requests that are easy | and self-contained. We also appreciate contributions to the docs – whether | it's fixing a typo, improving an example or adding additional explanations. + | You'll find a "Suggest edits" link at the bottom of each page that points + | you to the source. p | Another way of getting involved is to help us improve the