mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Fix lemmatizer
This commit is contained in:
parent
07776d8096
commit
a2f3510d6d
|
@ -47,18 +47,20 @@ class Lemmatizer(object):
|
||||||
elif univ_pos == PUNCT:
|
elif univ_pos == PUNCT:
|
||||||
univ_pos = 'punct'
|
univ_pos = 'punct'
|
||||||
# See Issue #435 for an example of where this logic is required.
|
# See Issue #435 for an example of where this logic is required.
|
||||||
if self.is_base_form(pos, **morphology):
|
if self.is_base_form(univ_pos, **morphology):
|
||||||
return set([string.lower()])
|
return set([string.lower()])
|
||||||
lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, []))
|
lemmas = lemmatize(string, self.index.get(univ_pos, {}),
|
||||||
|
self.exc.get(univ_pos, {}),
|
||||||
|
self.rules.get(univ_pos, []))
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
def is_base_form(self, pos, **morphology):
|
def is_base_form(self, univ_pos, **morphology):
|
||||||
'''Check whether we're dealing with an uninflected paradigm, so we can
|
'''Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
avoid lemmatization entirely.'''
|
avoid lemmatization entirely.'''
|
||||||
others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
|
others = [key for key in morphology if key not in ('number', 'pos', 'verbform')]
|
||||||
if pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
||||||
return True
|
return True
|
||||||
elif pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
Loading…
Reference in New Issue
Block a user