2017-01-10 21:24:10 +03:00
|
|
|
# coding: utf-8
|
2016-12-18 15:28:51 +03:00
|
|
|
from __future__ import unicode_literals
|
2016-11-04 02:27:32 +03:00
|
|
|
|
2016-12-18 15:28:51 +03:00
|
|
|
from ...symbols import POS, VERB, VerbForm_inf
|
|
|
|
from ...vocab import Vocab
|
|
|
|
from ...lemmatizer import Lemmatizer
|
2017-01-13 00:00:37 +03:00
|
|
|
from ..util import get_doc
|
2016-11-04 02:27:32 +03:00
|
|
|
|
2017-01-10 21:24:10 +03:00
|
|
|
import pytest
|
|
|
|
|
2016-11-04 02:27:32 +03:00
|
|
|
|
2017-01-13 00:00:37 +03:00
|
|
|
def test_issue595():
    """Regression test for issue 595: lemmatization of a base-form verb.

    Builds a vocab whose lemmatizer has empty verb index/exception tables
    and a single verb suffix rule ("ed" -> "e"), tags the token "feed" as
    'VB' (mapped to an infinitive verb by the tag map) and checks that its
    lemma is still "feed".
    """
    tokens = ["Do", "n't", "feed", "the", "dog"]
    verb_position = 2  # index of "feed" in ``tokens``

    # The only suffix rule: rewrite a trailing "ed" to "e" for verbs.
    # NOTE(review): presumably this rule is what used to mangle "feed"
    # before the fix -- confirm against the original issue report.
    suffix_rules = {"verb": [["ed", "e"]]}

    # 'VB' is declared as a verb carrying the infinitive verb-form feature.
    base_form_tags = {'VB': {POS: VERB, VerbForm_inf: True}}

    vocab = Vocab(
        lemmatizer=Lemmatizer({'verb': {}}, {'verb': {}}, suffix_rules),
        tag_map=base_form_tags,
    )
    doc = get_doc(vocab, tokens)

    # Assign the tag by hand; the lemma is read back after tagging.
    doc[verb_position].tag_ = 'VB'

    assert doc[verb_position].text == 'feed'
    assert doc[verb_position].lemma_ == 'feed'
|