mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-13 05:07:03 +03:00
* Fix inconsistent lemmatizer issue #3484 * Remove test case
This commit is contained in:
parent
b4d142e3c4
commit
955b95cb8b
|
@ -1,5 +1,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
|
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
|
||||||
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||||
|
@ -118,8 +119,8 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
forms.append(form)
|
forms.append(form)
|
||||||
else:
|
else:
|
||||||
oov_forms.append(form)
|
oov_forms.append(form)
|
||||||
# Remove duplicates, and sort forms generated by rules alphabetically.
|
# Remove duplicates but preserve the ordering of applied "rules"
|
||||||
forms = list(set(forms))
|
forms = list(OrderedDict.fromkeys(forms))
|
||||||
# Put exceptions at the front of the list, so they get priority.
|
# Put exceptions at the front of the list, so they get priority.
|
||||||
# This is a dodgy heuristic -- but it's the best we can do until we get
|
# This is a dodgy heuristic -- but it's the best we can do until we get
|
||||||
# frequencies on this. We can at least prune out problematic exceptions,
|
# frequencies on this. We can at least prune out problematic exceptions,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user