Fix inconsistent lemmatizer issue #3484 (#3646)

* Fix inconsistent lemmatizer issue #3484

* Remove test case
This commit is contained in:
Brad Jascob 2019-05-04 10:16:03 -06:00 committed by Ines Montani
parent b4d142e3c4
commit 955b95cb8b

View File

@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from collections import OrderedDict
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
@ -118,8 +119,8 @@ def lemmatize(string, index, exceptions, rules):
forms.append(form)
else:
oov_forms.append(form)
# Remove duplicates, and sort forms generated by rules alphabetically.
forms = list(set(forms))
# Remove duplicates but preserve the ordering of applied "rules"
forms = list(OrderedDict.fromkeys(forms))
# Put exceptions at the front of the list, so they get priority.
# This is a dodgy heuristic -- but it's the best we can do until we get
# frequencies on this. We can at least prune out problematic exceptions,