mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Merge pull request #5462 from adrianeboyd/feature/lemmatizer-all-upos
Extend lemmatizer rules for all UPOS tags
This commit is contained in:
commit
26cd6a0229
|
@ -6,6 +6,7 @@ from collections import OrderedDict
|
||||||
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
|
from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
|
||||||
from .errors import Errors
|
from .errors import Errors
|
||||||
from .lookups import Lookups
|
from .lookups import Lookups
|
||||||
|
from .parts_of_speech import NAMES as UPOS_NAMES
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
|
@ -43,17 +44,11 @@ class Lemmatizer(object):
|
||||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||||
if "lemma_rules" not in self.lookups:
|
if "lemma_rules" not in self.lookups:
|
||||||
return [lookup_table.get(string, string)]
|
return [lookup_table.get(string, string)]
|
||||||
if univ_pos in (NOUN, "NOUN", "noun"):
|
if isinstance(univ_pos, int):
|
||||||
univ_pos = "noun"
|
univ_pos = UPOS_NAMES.get(univ_pos, "X")
|
||||||
elif univ_pos in (VERB, "VERB", "verb"):
|
univ_pos = univ_pos.lower()
|
||||||
univ_pos = "verb"
|
|
||||||
elif univ_pos in (ADJ, "ADJ", "adj"):
|
if univ_pos in ("", "eol", "space"):
|
||||||
univ_pos = "adj"
|
|
||||||
elif univ_pos in (PUNCT, "PUNCT", "punct"):
|
|
||||||
univ_pos = "punct"
|
|
||||||
elif univ_pos in (PROPN, "PROPN"):
|
|
||||||
return [string]
|
|
||||||
else:
|
|
||||||
return [string.lower()]
|
return [string.lower()]
|
||||||
# See Issue #435 for an example of where this logic is required.
|
# See Issue #435 for an example of where this logic is required.
|
||||||
if self.is_base_form(univ_pos, morphology):
|
if self.is_base_form(univ_pos, morphology):
|
||||||
|
@ -61,6 +56,11 @@ class Lemmatizer(object):
|
||||||
index_table = self.lookups.get_table("lemma_index", {})
|
index_table = self.lookups.get_table("lemma_index", {})
|
||||||
exc_table = self.lookups.get_table("lemma_exc", {})
|
exc_table = self.lookups.get_table("lemma_exc", {})
|
||||||
rules_table = self.lookups.get_table("lemma_rules", {})
|
rules_table = self.lookups.get_table("lemma_rules", {})
|
||||||
|
if not any((index_table.get(univ_pos), exc_table.get(univ_pos), rules_table.get(univ_pos))):
|
||||||
|
if univ_pos == "propn":
|
||||||
|
return [string]
|
||||||
|
else:
|
||||||
|
return [string.lower()]
|
||||||
lemmas = self.lemmatize(
|
lemmas = self.lemmatize(
|
||||||
string,
|
string,
|
||||||
index_table.get(univ_pos, {}),
|
index_table.get(univ_pos, {}),
|
||||||
|
|
Loading…
Reference in New Issue
Block a user