mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-15 10:42:34 +03:00
Don't lower-case lemmas of proper nouns
This commit is contained in:
parent
a0ddb803fd
commit
0872cf611d
|
@ -1,7 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN
|
||||||
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
|
||||||
|
|
||||||
|
|
||||||
|
@ -27,11 +27,13 @@ class Lemmatizer(object):
|
||||||
univ_pos = 'adj'
|
univ_pos = 'adj'
|
||||||
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
|
elif univ_pos in (PROPN, 'PROPN'):
|
||||||
|
return [string]
|
||||||
else:
|
else:
|
||||||
return list(set([string.lower()]))
|
return [string.lower()]
|
||||||
# See Issue #435 for an example of where this logic is required.
|
# See Issue #435 for an example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
return list(set([string.lower()]))
|
return [string.lower()]
|
||||||
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||||
self.exc.get(univ_pos, {}),
|
self.exc.get(univ_pos, {}),
|
||||||
self.rules.get(univ_pos, []))
|
self.rules.get(univ_pos, []))
|
||||||
|
@ -88,6 +90,7 @@ class Lemmatizer(object):
|
||||||
|
|
||||||
|
|
||||||
def lemmatize(string, index, exceptions, rules):
|
def lemmatize(string, index, exceptions, rules):
|
||||||
|
orig = string
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
forms = []
|
forms = []
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
|
@ -105,5 +108,5 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.extend(oov_forms)
|
forms.extend(oov_forms)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(string)
|
forms.append(orig)
|
||||||
return list(set(forms))
|
return list(set(forms))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user