2018-07-25 00:38:44 +03:00
|
|
|
# coding: utf-8
|
2017-10-11 04:21:23 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2018-07-25 00:38:44 +03:00
|
|
|
import pytest
|
|
|
|
from spacy.vocab import Vocab
|
|
|
|
from spacy.tokens import Doc
|
|
|
|
from spacy.lemmatizer import Lemmatizer
|
2017-10-11 04:21:23 +03:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def lemmatizer():
    """Provide a Lemmatizer configured only with a small lookup table.

    The table maps three inflected forms ('dogs', 'boxen', 'mice') to
    their lemmas.
    """
    lookup_table = {'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}
    return Lemmatizer(lookup=lookup_table)
|
2017-10-11 04:21:23 +03:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def vocab(lemmatizer):
    """Provide a Vocab constructed with the `lemmatizer` fixture."""
    vocabulary = Vocab(lemmatizer=lemmatizer)
    return vocabulary
|
|
|
|
|
|
|
|
|
|
|
|
def test_empty_doc(vocab):
    """A Doc created from a vocab alone must report a length of zero."""
    empty_doc = Doc(vocab)
    token_count = len(empty_doc)
    assert token_count == 0
|
|
|
|
|
|
|
|
|
|
|
|
def test_single_word(vocab):
    """Check `Doc.text` for a one-word Doc, with and without `spaces`."""
    # No `spaces` argument: the expected text carries a trailing space.
    default_spacing = Doc(vocab, words=['a'])
    assert default_spacing.text == 'a '

    # spaces=[False]: the expected text is the bare word.
    no_trailing_space = Doc(vocab, words=['a'], spaces=[False])
    assert no_trailing_space.text == 'a'
|
|
|
|
|
|
|
|
|
|
|
|
def test_lookup_lemmatization(vocab):
    """Check token text and lemma for two words in a Doc.

    'dogs' is expected to lemmatize to 'dog'; 'dogses' is expected to
    keep its own text as its lemma.
    """
    words = ['dogs', 'dogses']
    doc = Doc(vocab, words=words)

    first = doc[0]
    assert first.text == 'dogs'
    assert first.lemma_ == 'dog'

    second = doc[1]
    assert second.text == 'dogses'
    assert second.lemma_ == 'dogses'
|