Don't lower-case lemmas of proper nouns

This commit is contained in:
Matthew Honnibal 2018-02-21 16:01:16 +01:00
parent a0ddb803fd
commit 0872cf611d

View File

@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
@ -27,11 +27,13 @@ class Lemmatizer(object):
univ_pos = 'adj'
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
univ_pos = 'punct'
elif univ_pos in (PROPN, 'PROPN'):
return [string]
else:
return list(set([string.lower()]))
return [string.lower()]
# See Issue #435 for an example of where this logic is required.
if self.is_base_form(univ_pos, morphology):
return list(set([string.lower()]))
return [string.lower()]
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
self.exc.get(univ_pos, {}),
self.rules.get(univ_pos, []))
@ -88,6 +90,7 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules):
orig = string
string = string.lower()
forms = []
forms.extend(exceptions.get(string, []))
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
forms.append(orig)
return list(set(forms))