Mirror of https://github.com/explosion/spaCy.git

Commit b127645afc: Fix test_misc merge conflict
@@ -7,6 +7,7 @@ from .deprecated import resolve_load_name
 from .about import __version__
 from . import util

+
 def load(name, **overrides):
     name = resolve_load_name(name, **overrides)
     return util.load_model(name, **overrides)
@@ -1,19 +1,40 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ..tokens import Doc
-from ..strings import StringStore
-from ..lemmatizer import Lemmatizer
-from ..attrs import ORTH, TAG, HEAD, DEP
-from .. import util
-
 from io import StringIO, BytesIO
 from pathlib import Path
 import pytest

+from .util import load_test_model
+from ..tokens import Doc
+from ..strings import StringStore
+from .. import util
+
+
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'he', 'hu', 'it', 'nb',
-              'nl', 'pl', 'pt', 'sv']
+              'nl', 'pl', 'pt', 'sv', 'xx']
+
+_models = {'en': ['en_core_web_sm', 'en_core_web_md'],
+           'de': ['de_core_news_md'],
+           'fr': ['fr_depvec_web_lg'],
+           'xx': ['xx_ent_web_md']}
+
+
+# only used for tests that require loading the models
+# in all other cases, use specific instances
+
+@pytest.fixture(params=_models['en'], scope="session")
+def EN(request):
+    return load_test_model(request.param)
+
+
+@pytest.fixture(params=_models['de'], scope="session")
+def DE(request):
+    return load_test_model(request.param)
+
+
+@pytest.fixture(params=_models['fr'], scope="session")
+def FR(request):
+    return load_test_model(request.param)
+
+
 @pytest.fixture(params=_languages)

@@ -91,11 +112,6 @@ def en_entityrecognizer():
     return util.get_lang_class('en').Defaults.create_entity()


-@pytest.fixture
-def lemmatizer():
-    return util.get_lang_class('en').Defaults.create_lemmatizer()
-
-
 @pytest.fixture
 def text_file():
     return StringIO()

@@ -105,22 +121,6 @@ def text_file_b():
     return BytesIO()


-# only used for tests that require loading the models
-# in all other cases, use specific instances
-@pytest.fixture(scope="session")
-def EN():
-    return English()
-
-
-@pytest.fixture(scope="session")
-def DE():
-    return German()
-
-
-@pytest.fixture(scope="session")
-def FR():
-    return French()
-
-
 def pytest_addoption(parser):
     parser.addoption("--models", action="store_true",
                      help="include tests that require full models")

@@ -129,8 +129,18 @@ def pytest_addoption(parser):
     parser.addoption("--slow", action="store_true",
                      help="include slow tests")

+    for lang in _languages + ['all']:
+        parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
+
+
 def pytest_runtest_setup(item):
     for opt in ['models', 'vectors', 'slow']:
         if opt in item.keywords and not item.config.getoption("--%s" % opt):
             pytest.skip("need --%s option to run" % opt)

+    # Check if test is marked with models and has arguments set, i.e. specific
+    # language. If so, skip test if flag not set.
+    if item.get_marker('models'):
+        for arg in item.get_marker('models').args:
+            if not item.config.getoption("--%s" % arg) and not item.config.getoption("--all"):
+                pytest.skip()
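For context on the gating added above, here is an illustrative, hypothetical test module (not part of this commit) showing how a model-dependent test combines the session fixture with the language-tagged models marker. As the hook is written here, such a test is skipped unless pytest is invoked with --models plus the matching language flag (--en) or --all.

    # Hypothetical example, for illustration only (not in this commit).
    import pytest

    @pytest.mark.models('en')       # gated by the pytest_runtest_setup hook above
    def test_en_model_loads(EN):    # EN: session-scoped model fixture from conftest.py
        doc = EN(u"This is a sentence.")
        assert len(doc) > 0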
spacy/tests/lang/de/test_parser.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...util import get_doc
+
+import pytest
+
+
+def test_de_parser_noun_chunks_standard_de(de_tokenizer):
+    text = "Eine Tasse steht auf dem Tisch."
+    heads = [1, 1, 0, -1, 1, -2, -4]
+    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
+    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
+
+    tokens = de_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 2
+    assert chunks[0].text_with_ws == "Eine Tasse "
+    assert chunks[1].text_with_ws == "dem Tisch "
+
+
+def test_de_extended_chunk(de_tokenizer):
+    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
+    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
+    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
+    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
+
+    tokens = de_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
+    chunks = list(doc.noun_chunks)
+    assert len(chunks) == 3
+    assert chunks[0].text_with_ws == "Die Sängerin "
+    assert chunks[1].text_with_ws == "einer Tasse Kaffee "
+    assert chunks[2].text_with_ws == "Arien "
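A note on the heads lists used in these test files: each value is an offset relative to the token's own index, with 0 marking the root; get_doc (see spacy/tests/util.py further down) presumably resolves them to absolute head indices when building the Doc. A minimal sketch of that convention, using the first German sentence above as data:

    # Illustration only: resolving the relative head offsets from the test data above.
    words = ['Eine', 'Tasse', 'steht', 'auf', 'dem', 'Tisch', '.']
    heads = [1, 1, 0, -1, 1, -2, -4]          # offset from each token's own index
    absolute = [i + offset for i, offset in enumerate(heads)]
    assert absolute == [1, 2, 2, 2, 5, 3, 2]  # 'steht' (index 2) is its own head, i.e. the ROOT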
@@ -1,87 +0,0 @@
-# coding: utf-8
-"""Test that tokens are created correctly for contractions."""
-
-
-from __future__ import unicode_literals
-
-import pytest
-
-
-def test_tokenizer_handles_basic_contraction(en_tokenizer):
-    text = "don't giggle"
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 3
-    assert tokens[1].text == "n't"
-    text = "i said don't!"
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 5
-    assert tokens[4].text == "!"
-
-
-@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
-def test_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 3
-
-
-@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
-def test_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
-    tokens = en_tokenizer(text_poss)
-    assert len(tokens) == 2
-    assert tokens[0].text == text
-    assert tokens[1].text == "'s"
-
-
-@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
-def test_tokenizer_splits_trailing_apos(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 2
-    assert tokens[0].text == text.split("'")[0]
-    assert tokens[1].text == "'"
-
-
-@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
-def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 1
-    assert tokens[0].text == text
-
-
-@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
-def test_tokenizer_handles_ll_contraction(en_tokenizer, text):
-    tokens = en_tokenizer(text)
-    assert len(tokens) == 2
-    assert tokens[0].text == text.split("'")[0]
-    assert tokens[1].text == "'ll"
-    assert tokens[1].lemma_ == "will"
-
-
-@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
-def test_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
-    tokens_lower = en_tokenizer(text_lower)
-    tokens_title = en_tokenizer(text_title)
-    assert tokens_title[0].text == tokens_lower[0].text.title()
-    assert tokens_lower[0].text == tokens_title[0].text.lower()
-    assert tokens_lower[1].text == tokens_title[1].text
-
-
-@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
-@pytest.mark.parametrize('contraction', ["'ll", "'d"])
-def test_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
-    tokens = en_tokenizer(pron + contraction)
-    assert tokens[0].text == pron
-    assert tokens[1].text == contraction
-
-
-@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
-def test_tokenizer_excludes_ambiguous(en_tokenizer, exc):
-    tokens = en_tokenizer(exc)
-    assert len(tokens) == 1
-
-
-@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
-def test_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
-    tokens = en_tokenizer(wo_punct)
-    assert len(tokens) == 2
-    tokens = en_tokenizer(w_punct)
-    assert len(tokens) == 3
@@ -1,19 +1,96 @@
 # coding: utf-8
-"""Test that tokenizer exceptions are handled correctly."""
-
-
 from __future__ import unicode_literals

 import pytest


+def test_en_tokenizer_handles_basic_contraction(en_tokenizer):
+    text = "don't giggle"
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == "n't"
+    text = "i said don't!"
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+    assert tokens[4].text == "!"
+
+
+@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"])
+def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")])
+def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text):
+    tokens = en_tokenizer(text_poss)
+    assert len(tokens) == 2
+    assert tokens[0].text == text
+    assert tokens[1].text == "'s"
+
+
+@pytest.mark.parametrize('text', ["schools'", "Alexis'"])
+def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == text.split("'")[0]
+    assert tokens[1].text == "'"
+
+
+@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"])
+def text_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].text == text
+
+
+@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"])
+def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text == text.split("'")[0]
+    assert tokens[1].text == "'ll"
+    assert tokens[1].lemma_ == "will"
+
+
+@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")])
+def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title):
+    tokens_lower = en_tokenizer(text_lower)
+    tokens_title = en_tokenizer(text_title)
+    assert tokens_title[0].text == tokens_lower[0].text.title()
+    assert tokens_lower[0].text == tokens_title[0].text.lower()
+    assert tokens_lower[1].text == tokens_title[1].text
+
+
+@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"])
+@pytest.mark.parametrize('contraction', ["'ll", "'d"])
+def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction):
+    tokens = en_tokenizer(pron + contraction)
+    assert tokens[0].text == pron
+    assert tokens[1].text == contraction
+
+
+@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"])
+def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc):
+    tokens = en_tokenizer(exc)
+    assert len(tokens) == 1
+
+
+@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")])
+def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct):
+    tokens = en_tokenizer(wo_punct)
+    assert len(tokens) == 2
+    tokens = en_tokenizer(w_punct)
+    assert len(tokens) == 3
+
+
 @pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."])
-def test_tokenizer_handles_abbr(en_tokenizer, text):
+def test_en_tokenizer_handles_abbr(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 1


-def test_tokenizer_handles_exc_in_text(en_tokenizer):
+def test_en_tokenizer_handles_exc_in_text(en_tokenizer):
     text = "It's mediocre i.e. bad."
     tokens = en_tokenizer(text)
     assert len(tokens) == 6

@@ -21,7 +98,7 @@ def test_tokenizer_handles_exc_in_text(en_tokenizer):


 @pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"])
-def test_tokenizer_handles_times(en_tokenizer, text):
+def test_en_tokenizer_handles_times(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2
     assert tokens[1].lemma_ in ["a.m.", "p.m."]
@@ -7,7 +7,7 @@ from __future__ import unicode_literals
 import pytest


-def test_simple_punct(en_tokenizer):
+def test_en_simple_punct(en_tokenizer):
     text = "to walk, do foo"
     tokens = en_tokenizer(text)
     assert tokens[0].idx == 0

@@ -17,7 +17,7 @@ def test_simple_punct(en_tokenizer):
     assert tokens[4].idx == 12


-def test_complex_punct(en_tokenizer):
+def test_en_complex_punct(en_tokenizer):
     text = "Tom (D., Ill.)!"
     tokens = en_tokenizer(text)
     assert tokens[0].idx == 0
spacy/tests/lang/en/test_lemmatizer.py (new file, 46 lines)
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.fixture
+def en_lemmatizer(EN):
+    return EN.Defaults.create_lemmatizer()
+
+
+@pytest.mark.models('en')
+@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
+                                         ("aardwolf", ["aardwolf"]),
+                                         ("planets", ["planet"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
+    assert en_lemmatizer.noun(text) == set(lemmas)
+
+
+@pytest.mark.xfail
+@pytest.mark.models('en')
+def test_en_lemmatizer_base_forms(en_lemmatizer):
+    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
+    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+
+
+@pytest.mark.models
+def test_en_lemmatizer_base_form_verb(en_lemmatizer):
+    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+
+
+@pytest.mark.models
+def test_en_lemmatizer_punct(en_lemmatizer):
+    assert en_lemmatizer.punct('“') == set(['"'])
+    assert en_lemmatizer.punct('”') == set(['"'])
+
+
+@pytest.mark.models('en')
+def test_en_lemmatizer_lemma_assignment(EN):
+    text = "Bananas in pyjamas are geese."
+    doc = EN.tokenizer(text)
+    assert all(t.lemma_ == '' for t in doc)
+    EN.tagger(doc)
+    assert all(t.lemma_ != '' for t in doc)
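As the lemmatizer tests above suggest, the English lemmatizer exposes per-POS lookups (noun, verb, punct) that return a set of candidate lemmas, and it can also be called generically with a POS name and a morphology dict, as in the updated test_issue781 further down. A small usage sketch, for illustration only (it needs the 'en' model data, like the tests guarded by @pytest.mark.models('en')):

    # Illustration only; mirrors the assertions in the tests above and in test_issue781.
    lemmatizer = EN.Defaults.create_lemmatizer()   # EN: the session-scoped model fixture
    assert lemmatizer.noun('planets') == set(['planet'])
    assert lemmatizer.noun('axes') == set(['axis', 'axe', 'ax'])
    assert lemmatizer('chromosomes', 'noun', morphology={'number': 'plur'}) == set(['chromosome'])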
@@ -5,8 +5,8 @@ from spacy.attrs import LOWER
 from spacy.matcher import Matcher


-@pytest.mark.models
-def test_simple_types(EN):
+@pytest.mark.models('en')
+def test_en_ner_simple_types(EN):
     tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
     ents = list(tokens.ents)
     assert ents[0].start == 1

@@ -17,8 +17,8 @@ def test_simple_types(EN):
     assert ents[1].label_ == 'GPE'


-@pytest.mark.models
-def test_consistency_bug(EN):
+@pytest.mark.models('en')
+def test_en_ner_consistency_bug(EN):
     '''Test an arbitrary sequence-consistency bug encountered during speed test'''
     tokens = EN(u'Where rap essentially went mainstream, illustrated by seminal Public Enemy, Beastie Boys and L.L. Cool J. tracks.')
     tokens = EN(u'''Charity and other short-term aid have buoyed them so far, and a tax-relief bill working its way through Congress would help. But the September 11 Victim Compensation Fund, enacted by Congress to discourage people from filing lawsuits, will determine the shape of their lives for years to come.\n\n''', entity=False)

@@ -26,8 +26,8 @@ def test_consistency_bug(EN):
     EN.entity(tokens)


-@pytest.mark.models
-def test_unit_end_gazetteer(EN):
+@pytest.mark.models('en')
+def test_en_ner_unit_end_gazetteer(EN):
     '''Test a bug in the interaction between the NER model and the gazetteer'''
     matcher = Matcher(EN.vocab)
     matcher.add('MemberNames', None, [{LOWER: 'cal'}], [{LOWER: 'cal'}, {LOWER: 'henderson'}])

@@ -38,6 +38,3 @@ def test_unit_end_gazetteer(EN):
     doc.ents += tuple(ents)
     EN.entity(doc)
     assert list(doc.ents)[0].text == 'cal'
-
-
-
@@ -1,7 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ..util import get_doc
+from ...util import get_doc

 import pytest

@@ -45,32 +45,3 @@ def test_parser_noun_chunks_pp_chunks(en_tokenizer):
     assert len(chunks) == 2
     assert chunks[0].text_with_ws == "A phrase "
     assert chunks[1].text_with_ws == "another phrase "
-
-
-def test_parser_noun_chunks_standard_de(de_tokenizer):
-    text = "Eine Tasse steht auf dem Tisch."
-    heads = [1, 1, 0, -1, 1, -2, -4]
-    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.']
-    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct']
-
-    tokens = de_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
-    chunks = list(doc.noun_chunks)
-    assert len(chunks) == 2
-    assert chunks[0].text_with_ws == "Eine Tasse "
-    assert chunks[1].text_with_ws == "dem Tisch "
-
-
-def test_de_extended_chunk(de_tokenizer):
-    text = "Die Sängerin singt mit einer Tasse Kaffee Arien."
-    heads = [1, 1, 0, -1, 1, -2, -1, -5, -6]
-    tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.']
-    deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct']
-
-    tokens = de_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags, deps=deps, heads=heads)
-    chunks = list(doc.noun_chunks)
-    assert len(chunks) == 3
-    assert chunks[0].text_with_ws == "Die Sängerin "
-    assert chunks[1].text_with_ws == "einer Tasse Kaffee "
-    assert chunks[2].text_with_ws == "Arien "
@@ -16,14 +16,14 @@ PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


 @pytest.mark.parametrize('text', ["(", "((", "<"])
-def test_tokenizer_handles_only_punct(en_tokenizer, text):
+def test_en_tokenizer_handles_only_punct(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == len(text)


 @pytest.mark.parametrize('punct', PUNCT_OPEN)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
+def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text):
     tokens = en_tokenizer(punct + text)
     assert len(tokens) == 2
     assert tokens[0].text == punct

@@ -32,7 +32,7 @@ def test_tokenizer_splits_open_punct(en_tokenizer, punct, text):
 @pytest.mark.parametrize('punct', PUNCT_CLOSE)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
+def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text):
     tokens = en_tokenizer(text + punct)
     assert len(tokens) == 2
     assert tokens[0].text == text

@@ -42,7 +42,7 @@ def test_tokenizer_splits_close_punct(en_tokenizer, punct, text):
 @pytest.mark.parametrize('punct', PUNCT_OPEN)
 @pytest.mark.parametrize('punct_add', ["`"])
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
+def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
     tokens = en_tokenizer(punct + punct_add + text)
     assert len(tokens) == 3
     assert tokens[0].text == punct

@@ -53,7 +53,7 @@ def test_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text):
 @pytest.mark.parametrize('punct', PUNCT_CLOSE)
 @pytest.mark.parametrize('punct_add', ["'"])
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
+def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):
     tokens = en_tokenizer(text + punct + punct_add)
     assert len(tokens) == 3
     assert tokens[0].text == text

@@ -63,7 +63,7 @@ def test_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text):


 @pytest.mark.parametrize('punct', PUNCT_OPEN)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
+def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
     tokens = en_tokenizer(punct + punct + punct + text)
     assert len(tokens) == 4
     assert tokens[0].text == punct

@@ -72,7 +72,7 @@ def test_tokenizer_splits_same_open_punct(en_tokenizer, punct, text):
 @pytest.mark.parametrize('punct', PUNCT_CLOSE)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
+def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):
     tokens = en_tokenizer(text + punct + punct + punct)
     assert len(tokens) == 4
     assert tokens[0].text == text

@@ -80,14 +80,14 @@ def test_tokenizer_splits_same_close_punct(en_tokenizer, punct, text):


 @pytest.mark.parametrize('text', ["'The"])
-def test_tokenizer_splits_open_appostrophe(en_tokenizer, text):
+def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2
     assert tokens[0].text == "'"


 @pytest.mark.parametrize('text', ["Hello''"])
-def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
+def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2
     tokens_punct = en_tokenizer("''")

@@ -96,7 +96,7 @@ def test_tokenizer_splits_double_end_quote(en_tokenizer, text):
 @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
+def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
         punct_close, text):
     tokens = en_tokenizer(punct_open + text + punct_close)
     assert len(tokens) == 3

@@ -108,7 +108,7 @@ def test_tokenizer_splits_open_close_punct(en_tokenizer, punct_open,
 @pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED)
 @pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")])
 @pytest.mark.parametrize('text', ["Hello"])
-def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
+def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,
         punct_open2, punct_close2, text):
     tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2)
     assert len(tokens) == 5

@@ -120,13 +120,13 @@ def test_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close,


 @pytest.mark.parametrize('text,punct', [("(can't", "(")])
-def test_tokenizer_splits_pre_punct_regex(text, punct):
+def test_en_tokenizer_splits_pre_punct_regex(text, punct):
     en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search
     match = en_search_prefixes(text)
     assert match.group() == punct


-def test_tokenizer_splits_bracket_period(en_tokenizer):
+def test_en_tokenizer_splits_bracket_period(en_tokenizer):
     text = "(And a 6a.m. run through Washington Park)."
     tokens = en_tokenizer(text)
     assert tokens[len(tokens) - 1].text == "."
@@ -1,9 +1,65 @@
-# encoding: utf-8
+# coding: utf-8
 from __future__ import unicode_literals

+from ....tokens import Doc
+from ...util import get_doc, apply_transition_sequence
+
 import pytest


+@pytest.mark.parametrize('text', ["A test sentence"])
+@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
+def test_en_sbd_single_punct(en_tokenizer, text, punct):
+    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
+    tokens = en_tokenizer(text + punct)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    assert len(doc) == 4 if punct else 3
+    assert len(list(doc.sents)) == 1
+    assert sum(len(sent) for sent in doc.sents) == len(doc)
+
+
+@pytest.mark.xfail
+def test_en_sentence_breaks(en_tokenizer, en_parser):
+    text = "This is a sentence . This is another one ."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
+    deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
+            'attr', 'punct']
+    transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
+                  'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
+    apply_transition_sequence(en_parser, doc, transition)
+
+    assert len(list(doc.sents)) == 2
+    for token in doc:
+        assert token.dep != 0 or token.is_space
+    assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
+
+
+# Currently, there's no way of setting the serializer data for the parser
+# without loading the models, so we can't remove the model dependency here yet.
+
+@pytest.mark.xfail
+@pytest.mark.models('en')
+def test_en_sbd_serialization_projective(EN):
+    """Test that before and after serialization, the sentence boundaries are
+    the same."""
+
+    text = "I bought a couch from IKEA It wasn't very comfortable."
+    transition = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj',
+                  'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod',
+                  'R-acomp', 'D', 'R-punct']
+
+    doc = EN.tokenizer(text)
+    apply_transition_sequence(EN.parser, doc, transition)
+    doc_serialized = Doc(EN.vocab).from_bytes(doc.to_bytes())
+    assert doc.is_parsed == True
+    assert doc_serialized.is_parsed == True
+    assert doc.to_bytes() == doc_serialized.to_bytes()
+    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]
+
+
 TEST_CASES = [
     ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
     ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),

@@ -59,10 +115,9 @@ TEST_CASES = [
     pytest.mark.xfail(("Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot.", ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."]))
 ]

-@pytest.mark.slow
-@pytest.mark.models
+@pytest.mark.models('en')
 @pytest.mark.parametrize('text,expected_sents', TEST_CASES)
-def test_parser_sbd_prag(EN, text, expected_sents):
+def test_en_sbd_prag(EN, text, expected_sents):
     """SBD tests from Pragmatic Segmenter"""
     doc = EN(text)
     sents = []
spacy/tests/lang/en/test_tagger.py (new file, 59 lines)
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ....parts_of_speech import SPACE
+from ...util import get_doc
+
+import six
+import pytest
+
+
+def test_en_tagger_load_morph_exc(en_tokenizer):
+    text = "I like his style."
+    tags = ['PRP', 'VBP', 'PRP$', 'NN', '.']
+    morph_exc = {'VBP': {'like': {'L': 'luck'}}}
+    en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags)
+    assert doc[1].tag_ == 'VBP'
+    assert doc[1].lemma_ == 'luck'
+
+
+@pytest.mark.models('en')
+def test_tag_names(EN):
+    text = "I ate pizzas with anchovies."
+    doc = EN(text, parse=False, tag=True)
+    assert type(doc[2].pos) == int
+    assert isinstance(doc[2].pos_, six.text_type)
+    assert type(doc[2].dep) == int
+    assert isinstance(doc[2].dep_, six.text_type)
+    assert doc[2].tag_ == u'NNS'
+
+
+@pytest.mark.models('en')
+def test_en_tagger_spaces(EN):
+    """Ensure spaces are assigned the POS tag SPACE"""
+    text = "Some\nspaces are\tnecessary."
+    doc = EN(text, tag=True, parse=False)
+    assert doc[0].pos != SPACE
+    assert doc[0].pos_ != 'SPACE'
+    assert doc[1].pos == SPACE
+    assert doc[1].pos_ == 'SPACE'
+    assert doc[1].tag_ == 'SP'
+    assert doc[2].pos != SPACE
+    assert doc[3].pos != SPACE
+    assert doc[4].pos == SPACE
+
+
+@pytest.mark.models('en')
+def test_en_tagger_return_char(EN):
+    """Ensure spaces are assigned the POS tag SPACE"""
+    text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if '
+            'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n')
+    tokens = EN(text)
+    for token in tokens:
+        if token.is_space:
+            assert token.pos == SPACE
+    assert tokens[3].text == '\r\n\r\n'
+    assert tokens[3].is_space
+    assert tokens[3].pos == SPACE
@@ -7,7 +7,7 @@ from __future__ import unicode_literals
 import pytest


-def test_tokenizer_handles_long_text(en_tokenizer):
+def test_en_tokenizer_handles_long_text(en_tokenizer):
     text = """Tributes pour in for late British Labour Party leader

 Tributes poured in from around the world Thursday

@@ -30,7 +30,7 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
     ("""'Me too!', Mr. P. Delaware cried. """, 11),
     ("They ran about 10km.", 6),
     pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))])
-def test_tokenizer_handles_cnts(en_tokenizer, text, length):
+def test_en_tokenizer_handles_cnts(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
     assert len(tokens) == length
@@ -1,37 +1,33 @@
 # coding: utf-8

 from __future__ import unicode_literals

 import pytest


-@pytest.mark.models
+@pytest.mark.models('fr')
 def test_lemmatizer_verb(FR):
-    text = "Qu'est-ce que tu fais?"
-    tokens = FR(text)
+    tokens = FR("Qu'est-ce que tu fais?")
     assert tokens[0].lemma_ == "que"
     assert tokens[1].lemma_ == "être"
     assert tokens[5].lemma_ == "faire"


-@pytest.mark.models
+@pytest.mark.models('fr')
 @pytest.mark.xfail(reason="sont tagged as AUX")
 def test_lemmatizer_noun_verb_2(FR):
-    text = "Les abaissements de température sont gênants."
-    tokens = FR(text)
+    tokens = FR("Les abaissements de température sont gênants.")
     assert tokens[4].lemma_ == "être"


-@pytest.mark.models
+@pytest.mark.models('fr')
 @pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN")
-def test_lemmatizer_noun(FR):
-    text = "il y a des Costaricienne."
-    tokens = FR(text)
+def test_lemmatizer_noun(model):
+    tokens = FR("il y a des Costaricienne.")
     assert tokens[4].lemma_ == "Costaricain"


-@pytest.mark.models
+@pytest.mark.models('fr')
 def test_lemmatizer_noun_2(FR):
-    text = "Les abaissements de température sont gênants."
-    tokens = FR(text)
+    tokens = FR("Les abaissements de température sont gênants.")
     assert tokens[1].lemma_ == "abaissement"
     assert tokens[5].lemma_ == "gênant"
@@ -1,60 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...tokens import Doc
-from ..util import get_doc, apply_transition_sequence
-
-import pytest
-
-
-@pytest.mark.parametrize('text', ["A test sentence"])
-@pytest.mark.parametrize('punct', ['.', '!', '?', ''])
-def test_parser_sbd_single_punct(en_tokenizer, text, punct):
-    heads = [2, 1, 0, -1] if punct else [2, 1, 0]
-    tokens = en_tokenizer(text + punct)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
-    assert len(doc) == 4 if punct else 3
-    assert len(list(doc.sents)) == 1
-    assert sum(len(sent) for sent in doc.sents) == len(doc)
-
-
-@pytest.mark.xfail
-def test_parser_sentence_breaks(en_tokenizer, en_parser):
-    text = "This is a sentence . This is another one ."
-    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
-    deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
-            'attr', 'punct']
-    transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
-                  'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
-
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
-    apply_transition_sequence(en_parser, doc, transition)
-
-    assert len(list(doc.sents)) == 2
-    for token in doc:
-        assert token.dep != 0 or token.is_space
-    assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
-
-
-# Currently, there's no way of setting the serializer data for the parser
-# without loading the models, so we can't remove the model dependency here yet.
-
-@pytest.mark.xfail
-@pytest.mark.models
-def test_parser_sbd_serialization_projective(EN):
-    """Test that before and after serialization, the sentence boundaries are
-    the same."""
-
-    text = "I bought a couch from IKEA It wasn't very comfortable."
-    transition = ['L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj',
-                  'B-ROOT', 'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod',
-                  'R-acomp', 'D', 'R-punct']
-
-    doc = EN.tokenizer(text)
-    apply_transition_sequence(EN.parser, doc, transition)
-    doc_serialized = Doc(EN.vocab).from_bytes(doc.to_bytes())
-    assert doc.is_parsed == True
-    assert doc_serialized.is_parsed == True
-    assert doc.to_bytes() == doc_serialized.to_bytes()
-    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]
@@ -1,17 +1,11 @@
 import pytest

 from ...pipeline import NeuralDependencyParser
-from ...vocab import Vocab


 @pytest.fixture
-def vocab():
-    return Vocab()
-
-
-@pytest.fixture
-def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+def parser(en_vocab):
+    parser = NeuralDependencyParser(en_vocab)
     parser.add_label('nsubj')
     parser.model, cfg = parser.Model(parser.moves.n_moves)
     parser.cfg.update(cfg)

@@ -19,8 +13,8 @@ def parser(vocab):


 @pytest.fixture
-def blank_parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+def blank_parser(en_vocab):
+    parser = NeuralDependencyParser(en_vocab)
     return parser
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 @pytest.mark.parametrize('text,i', [("Jane's got a new car", 1),
                                     ("Jane thinks that's a nice car", 3)])
 def test_issue401(EN, text, i):

@@ -6,7 +6,7 @@ from ...matcher import Matcher
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:

@@ -6,7 +6,7 @@ from ..util import get_doc
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 def test_issue514(EN):
     """Test serializing after adding entity"""
     text = ["This", "is", "a", "sentence", "about", "pasta", "."]

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 def test_issue54(EN):
     text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
     tokens = EN(text)

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 @pytest.mark.parametrize('text', ["He is the man", "he is the man"])
 def test_issue686(EN, text):
     """Test that pronoun lemmas are assigned correctly."""

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 def test_issue693(EN):
     """Test that doc.noun_chunks parses the complete sentence."""

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 def test_issue704(EN):
     """Test that sentence boundaries are detected correctly."""

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 @pytest.mark.parametrize('text1,text2',
                          [("You're happy", "You are happy"),
                           ("I'm happy", "I am happy"),

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 import pytest


-@pytest.mark.models
+@pytest.mark.models('en')
 @pytest.mark.parametrize('text', ["s..."])
 def test_issue719(EN, text):
     """Test that the token 's' is not lemmatized into empty string."""

@@ -4,7 +4,7 @@ import pytest


 @pytest.mark.xfail
-@pytest.mark.models
+@pytest.mark.models('en')
 def test_issue758(EN):
     '''Test parser transition bug after label added.'''
     from ...matcher import merge_phrase

@@ -5,6 +5,8 @@ import pytest


 # Note: "chromosomes" worked previous the bug fix
+@pytest.mark.models('en')
 @pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
-def test_issue781(lemmatizer, word, lemmas):
+def test_issue781(EN, word, lemmas):
+    lemmatizer = EN.Defaults.create_lemmatizer()
     assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)

@@ -70,8 +70,8 @@ def temp_save_model(model):


-@pytest.mark.models
-def test_issue910(train_data, additional_entity_types):
+@pytest.mark.models('en')
+def test_issue910(EN, train_data, additional_entity_types):
     '''Test that adding entities and resuming training works passably OK.
     There are two issues here:

@@ -79,8 +79,7 @@ def test_issue910(train_data, additional_entity_types):
     2) There's no way to set the learning rate for the weight update, so we
     end up out-of-scale, causing it to learn too fast.
     '''
-    nlp = English()
-    doc = nlp(u"I am looking for a restaurant in Berlin")
+    doc = EN(u"I am looking for a restaurant in Berlin")
     ents_before_train = [(ent.label_, ent.text) for ent in doc.ents]
     # Fine tune the ner model
     for entity_type in additional_entity_types:

@@ -1,18 +1,13 @@
 from __future__ import unicode_literals

 import pytest
-from ... import load as load_spacy
-
-
-@pytest.fixture
-def doc():
-    nlp = load_spacy('en')
-    return nlp('Does flight number three fifty-four require a connecting flight'
-               ' to get to Boston?')


-@pytest.mark.models
-def test_issue955(doc):
+@pytest.mark.models('en')
+def test_issue955(EN, doc):
     '''Test that we don't have any nested noun chunks'''
+    doc = EN('Does flight number three fifty-four require a connecting flight'
+             ' to get to Boston?')
     seen_tokens = set()
     for np in doc.noun_chunks:
         print(np.text, np.root.text, np.root.dep_, np.root.tag_)
@@ -1,49 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-
-
-@pytest.mark.models
-@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
-                                         ("aardwolf", ["aardwolf"]),
-                                         ("planets", ["planet"]),
-                                         ("ring", ["ring"]),
-                                         ("axes", ["axis", "axe", "ax"])])
-def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
-    if lemmatizer is None:
-        return None
-    assert lemmatizer.noun(text) == set(lemmas)
-
-
-@pytest.mark.xfail
-@pytest.mark.models
-def test_tagger_lemmatizer_base_forms(lemmatizer):
-    if lemmatizer is None:
-        return None
-    assert lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
-    assert lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
-
-
-@pytest.mark.models
-def test_tagger_lemmatizer_base_form_verb(lemmatizer):
-    if lemmatizer is None:
-        return None
-    assert lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
-
-
-@pytest.mark.models
-def test_tagger_lemmatizer_punct(lemmatizer):
-    if lemmatizer is None:
-        return None
-    assert lemmatizer.punct('“') == set(['"'])
-    assert lemmatizer.punct('”') == set(['"'])
-
-
-@pytest.mark.models
-def test_tagger_lemmatizer_lemma_assignment(EN):
-    text = "Bananas in pyjamas are geese."
-    doc = EN.tokenizer(text)
-    assert all(t.lemma_ == '' for t in doc)
-    EN.tagger(doc)
-    assert all(t.lemma_ != '' for t in doc)

@@ -1,17 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ..util import get_doc
-
-import pytest
-
-
-def test_tagger_load_morph_exc(en_tokenizer):
-    text = "I like his style."
-    tags = ['PRP', 'VBP', 'PRP$', 'NN', '.']
-    morph_exc = {'VBP': {'like': {'L': 'luck'}}}
-    en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc)
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags)
-    assert doc[1].tag_ == 'VBP'
-    assert doc[1].lemma_ == 'luck'

@@ -1,35 +0,0 @@
-# coding: utf-8
-"""Ensure spaces are assigned the POS tag SPACE"""
-
-
-from __future__ import unicode_literals
-from ...parts_of_speech import SPACE
-
-import pytest
-
-
-@pytest.mark.models
-def test_tagger_spaces(EN):
-    text = "Some\nspaces are\tnecessary."
-    doc = EN(text, tag=True, parse=False)
-    assert doc[0].pos != SPACE
-    assert doc[0].pos_ != 'SPACE'
-    assert doc[1].pos == SPACE
-    assert doc[1].pos_ == 'SPACE'
-    assert doc[1].tag_ == 'SP'
-    assert doc[2].pos != SPACE
-    assert doc[3].pos != SPACE
-    assert doc[4].pos == SPACE
-
-
-@pytest.mark.models
-def test_tagger_return_char(EN):
-    text = ('hi Aaron,\r\n\r\nHow is your schedule today, I was wondering if '
-            'you had time for a phone\r\ncall this afternoon?\r\n\r\n\r\n')
-    tokens = EN(text)
-    for token in tokens:
-        if token.is_space:
-            assert token.pos == SPACE
-    assert tokens[3].text == '\r\n\r\n'
-    assert tokens[3].is_space
-    assert tokens[3].pos == SPACE

@@ -1,16 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import six
-import pytest
-
-
-@pytest.mark.models
-def test_tag_names(EN):
-    text = "I ate pizzas with anchovies."
-    doc = EN(text, parse=False, tag=True)
-    assert type(doc[2].pos) == int
-    assert isinstance(doc[2].pos_, six.text_type)
-    assert type(doc[2].dep) == int
-    assert isinstance(doc[2].dep_, six.text_type)
-    assert doc[2].tag_ == u'NNS'
@@ -20,6 +20,7 @@ def test_util_ensure_path_succeeds(text):
     assert isinstance(path, Path)


+@pytest.mark.models
 def test_simple_model_roundtrip_bytes():
     model = Maxout(5, 10, pieces=2)
     model.b += 1

@@ -29,6 +30,7 @@ def test_simple_model_roundtrip_bytes():
     assert model.b[0, 0] == 1


+@pytest.mark.models
 def test_multi_model_roundtrip_bytes():
     model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
     model._layers[0].b += 1

@@ -41,6 +43,7 @@ def test_multi_model_roundtrip_bytes():
     assert model._layers[1].b[0, 0] == 2


+@pytest.mark.models
 def test_multi_model_load_missing_dims():
     model = chain(Maxout(5, 10, pieces=2), Maxout(2, 3))
     model._layers[0].b += 1
@@ -4,9 +4,20 @@ from __future__ import unicode_literals
 from ..tokens import Doc
 from ..attrs import ORTH, POS, HEAD, DEP

+import pytest
 import numpy


+MODELS = {}
+
+
+def load_test_model(model):
+    if model not in MODELS:
+        module = pytest.importorskip(model)
+        MODELS[model] = module.load()
+    return MODELS[model]
+
+
 def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
     """Create Doc object from given vocab, words and annotations."""
     pos = pos or [''] * len(words)
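The load_test_model helper added here backs the session-scoped EN/DE/FR fixtures in conftest.py: pytest.importorskip skips the requesting test when the model package is not installed, and the module-level MODELS dict caches each loaded pipeline so it is only constructed once per session. A rough usage sketch, assuming the en_core_web_sm package is installed:

    # Illustration only; not part of this commit.
    from spacy.tests.util import load_test_model

    nlp = load_test_model('en_core_web_sm')    # first call imports the package and calls .load()
    same = load_test_model('en_core_web_sm')   # later calls return the cached pipeline
    assert nlp is same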
@@ -2,9 +2,16 @@
 include ../../_includes/_mixins

++aside("Help us improve the docs")
+    | Did you spot a mistake or come across explanations that
+    | are unclear? We always appreciate improvement
+    | #[+a(gh("spaCy") + "/issues") suggestions] or
+    | #[+a(gh("spaCy") + "/pulls") pull requests]. You can find a "Suggest
+    | edits" link at the bottom of each page that points you to the source.
+
 +h(2, "whats-spacy") What's spaCy?

-+grid
++grid.o-no-block
     +grid-col("half")

     +grid-col("half")

@@ -52,8 +59,8 @@ p
     +row
         +cell #[strong Dependency Parsing]
         +cell
-            | Assigning syntactic dependency labels, i.e. the relations between
-            | individual tokens.
+            | Assigning syntactic dependency labels, describing the relations
+            | between individual tokens, like subject or object.
         +cell #[+procon("pro")]

     +row

@@ -374,6 +381,8 @@ p
     | on GitHub, which we use to tag bugs and feature requests that are easy
     | and self-contained. We also appreciate contributions to the docs – whether
     | it's fixing a typo, improving an example or adding additional explanations.
+    | You'll find a "Suggest edits" link at the bottom of each page that points
+    | you to the source.

 p
     | Another way of getting involved is to help us improve the