Add Spanish rule-based lemmatizer (#6833)

* Initial Spanish lemmatizer

* Handle merged verb+pron(s) multi-word tokens

* Use VERB for AUX rule lookup

* Add morph to lemma cache key

* Fix aux lookups, minor refactoring

* Improve verb+pron handling

* Move verb+pron handling into its own method
* Check for exceptions (primarily for se)
* Collect pronouns in the same (not reversed) order

* Only add modified possible lemmas
This commit is contained in:
Adriane Boyd 2021-01-27 12:21:35 +01:00 committed by GitHub
parent abb24fdc0f
commit d17afb4826
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 437 additions and 1 deletions

View File

@ -1,6 +1,9 @@
from typing import Optional
from thinc.api import Model
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS from .lex_attrs import LEX_ATTRS
from .lemmatizer import SpanishLemmatizer
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
from ...language import Language from ...language import Language
@ -20,4 +23,14 @@ class Spanish(Language):
Defaults = SpanishDefaults Defaults = SpanishDefaults
@Spanish.factory(
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "overwrite": False},
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(
    nlp: Language,
    model: Optional[Model],
    name: str,
    mode: str,
    overwrite: bool,
):
    """
    Factory registered on Spanish under the name "lemmatizer".

    nlp (Language): The pipeline object; only its vocab is passed on.
    model (Optional[Model]): Forwarded unchanged to the lemmatizer.
    name (str): Forwarded unchanged to the lemmatizer.
    mode (str): Lemmatizer mode, "rule" in the default config.
    overwrite (bool): Forwarded as the lemmatizer's overwrite setting.
    RETURNS (SpanishLemmatizer): The newly constructed component.
    """
    lemmatizer = SpanishLemmatizer(
        nlp.vocab, model, name, mode=mode, overwrite=overwrite
    )
    return lemmatizer
__all__ = ["Spanish"] __all__ = ["Spanish"]

423
spacy/lang/es/lemmatizer.py Normal file
View File

@ -0,0 +1,423 @@
from typing import List, Optional, Tuple
import re
from ...pipeline import Lemmatizer
from ...tokens import Token
class SpanishLemmatizer(Lemmatizer):
    """
    Spanish rule-based lemmatizer with morph-based rule selection.

    In "rule" mode four lookup tables are required: "lemma_rules",
    "lemma_rules_groups", "lemma_index" and "lemma_exc". Each lemmatize_<pos>
    method takes the same arguments (word, features, rule, index) so that
    rule_lemmatize can dispatch to it by name.
    """

    # One trailing clitic pronoun (me/te/se, la(s)/le(s)/lo(s), nos/os).
    # Group 1 is the remaining verb form, group 2 the pronoun.
    _VERB_PRON_PATTERN = re.compile(r"^(.*?)([mts]e|l[aeo]s?|n?os)$")

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """
        Return the lookup tables needed for a lemmatizer mode.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): The required and the optional
            table names. Modes other than "rule" defer to the parent class.
        """
        if mode == "rule":
            required = ["lemma_rules", "lemma_rules_groups", "lemma_index", "lemma_exc"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        """
        Lemmatize a token using the rule tables.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): The candidate lemmas, without duplicates and in
            the order they were generated.
        """
        cache_key = (token.orth, token.pos, str(token.morph))
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        pos = token.pos_.lower()
        features = set(token.morph)
        if pos in ("", "eol", "space"):
            return [string.lower()]
        if pos in (
            "adp",
            "cconj",
            "intj",
            "part",
            "propn",
            "punct",
            "sconj",
            "sym",
            "x",
        ):
            # These classes are not inflected by the rules: keep the text,
            # lowercasing only at sentence start (never for proper nouns).
            if token.is_sent_start and pos != "propn":
                return [string.lower()]
            else:
                return [string]

        string = string.lower()
        exc = self.lookups.get_table("lemma_exc").get(pos, {}).get(string)
        if exc is not None:
            lemmas = list(exc)
        else:
            # Auxiliaries share the verb rules, index and lemmatize method
            rule_pos = "verb" if pos == "aux" else pos
            rule = self.select_rule(rule_pos, features)
            index = self.lookups.get_table("lemma_index").get(rule_pos, [])
            lemmas = getattr(self, "lemmatize_" + rule_pos)(
                string, features, rule, index
            )
        # Remove duplicates but preserve the ordering
        lemmas = list(dict.fromkeys(lemmas))

        self.cache[cache_key] = lemmas
        return lemmas

    def select_rule(self, pos: str, features: List[str]) -> Optional[str]:
        """
        Pick the rule group for a POS whose features are all present.

        pos (str): The lowercase POS used as key into "lemma_rules_groups".
        features (List[str]): The morphological features as Feat=Val strings.
        RETURNS (Optional[str]): The name of the first group whose feature
            list is a subset of the given features, or None if none matches.
        """
        groups = self.lookups.get_table("lemma_rules_groups")
        if pos in groups:
            for group in groups[pos]:
                # group[0] is the rule name, group[1] its required features
                if set(group[1]).issubset(features):
                    return group[0]
        return None

    @staticmethod
    def _apply_suffix_rules(word: str, rules) -> List[str]:
        """Apply each (old, new) suffix rule and keep only modified forms."""
        candidates = []
        for old, new in rules:
            candidate = re.sub(old + "$", new, word)
            if candidate != word:
                candidates.append(candidate)
        return candidates

    def _lemmatize_adj_or_noun(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """Shared implementation of lemmatize_adj and lemmatize_noun."""
        rules = self.lookups.get_table("lemma_rules")
        # Apply lemmatization rules
        possible_lemmas = self._apply_suffix_rules(word, rules.get(rule, []))
        # Additional rule for plurals that go from esdrújula to grave and end
        # in 'n' or 's', e.g., jóvenes -> joven, órdenes -> orden
        additional_lemmas = []
        if "Number=Plur" in features:
            accent_rules = rules.get("accents", [])
            for possible_lemma in possible_lemmas:
                if possible_lemma.endswith(("n", "s")):
                    for old, new in accent_rules:
                        additional_lemmas.append(re.sub(old, new, possible_lemma))
        possible_lemmas.extend(additional_lemmas)
        # If one or more of the created possible lemmas are in the lookup
        # list, return all of them
        selected_lemmas = [lemma for lemma in possible_lemmas if lemma in index]
        if selected_lemmas:
            return selected_lemmas
        if possible_lemmas:
            return possible_lemmas
        return [word]

    def _lemmatize_det_or_pron(
        self, word: str, index: List[str], specific_rules: str
    ) -> List[str]:
        """
        Shared implementation of lemmatize_det and lemmatize_pron.

        specific_rules is the "lemma_rules" key holding the POS-specific
        word-to-lemma pairs ("det" or "pron").
        """
        rules = self.lookups.get_table("lemma_rules")
        # First search the POS-specific rules, then the rules common to
        # determiners and pronouns that follow a unique pattern. An exact
        # match on the word returns its lemma right away.
        for key in (specific_rules, "det_and_pron_fixed"):
            for old, new in rules.get(key, []):
                if word == old:
                    return [new]
        # Otherwise apply the general rules. Only modified forms are
        # collected, so the original word is listed exactly once, last.
        possible_lemmas = self._apply_suffix_rules(
            word, rules.get("det_and_pron_general", [])
        )
        possible_lemmas.append(word)
        selected_lemmas = [lemma for lemma in possible_lemmas if lemma in index]
        return selected_lemmas if selected_lemmas else possible_lemmas

    def lemmatize_adj(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize an adjective.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        rule (str): The key of the suffix rules in "lemma_rules".
        index (List[str]): The POS-specific lookup list.
        RETURNS (List[str]): The list of lemmas.
        """
        return self._lemmatize_adj_or_noun(word, features, rule, index)

    def lemmatize_adv(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize an adverb.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs. Not used.
        rule (str): Not used; the "adverbs" rules always apply.
        index (List[str]): The POS-specific lookup list. Not used.
        RETURNS (List[str]): The list of lemmas.
        """
        # Apply lemmatization rules
        for old, new in self.lookups.get_table("lemma_rules").get("adverbs", []):
            if word == old:
                return [new]
        # If none of the rules applies, return the original word
        return [word]

    def lemmatize_det(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a determiner.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs. Not used.
        rule (str): Not used; the "det" and "det_and_pron_*" rules apply.
        index (List[str]): The POS-specific lookup list.
        RETURNS (List[str]): The list of lemmas.
        """
        return self._lemmatize_det_or_pron(word, index, "det")

    def lemmatize_noun(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a noun.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        rule (str): The key of the suffix rules in "lemma_rules".
        index (List[str]): The POS-specific lookup list.
        RETURNS (List[str]): The list of lemmas.
        """
        return self._lemmatize_adj_or_noun(word, features, rule, index)

    def lemmatize_num(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a numeral.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs. Not used.
        rule (str): Not used; the "num" rules always apply.
        index (List[str]): The POS-specific lookup list. Not used.
        RETURNS (List[str]): The list of lemmas.
        """
        # If the word is in the list of rules for numerals, return the
        # corresponding lemma
        for old, new in self.lookups.get_table("lemma_rules").get("num", []):
            if word == old:
                return [new]
        # Normalize punctuation: drop the dots when the part before the first
        # comma ends in a dot plus three digits, then turn commas into dots
        integer_part = word.split(",")[0]
        if re.search(r"(\.)([0-9]{3})$", integer_part):
            word = word.replace(".", "")
        word = word.replace(",", ".")
        return [word]

    def lemmatize_pron(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a pronoun.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs. Not used.
        rule (str): Not used; the "pron" and "det_and_pron_*" rules apply.
        index (List[str]): The POS-specific lookup list.
        RETURNS (List[str]): The list of lemmas.
        """
        return self._lemmatize_det_or_pron(word, index, "pron")

    def lemmatize_verb(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a verb.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        rule (str): The key of the suffix rules in "lemma_rules".
        index (List[str]): The POS-specific lookup list.
        RETURNS (List[str]): The list of lemmas.
        """
        # Exceptions for verb+pronoun(s)
        if "PronType=Prs" in features:
            return self.lemmatize_verb_pron(word, features, rule, index)

        rules = self.lookups.get_table("lemma_rules")
        # Apply lemmatization rules
        possible_lemmas = self._apply_suffix_rules(word, rules.get(rule, []))
        selected_lemmas = [lemma for lemma in possible_lemmas if lemma in index]
        if not selected_lemmas:
            # If none of the possible lemmas are in the lookup list,
            # apply vocalic alternation rules and search in the lookup list
            # again
            for lemma in possible_lemmas:
                for old, new in rules.get("voc_alt_1", []):
                    if old in lemma:
                        # Try the replacement at every single position
                        for i, char in enumerate(lemma):
                            if char == old:
                                voc_alt_lemma = lemma[:i] + new + lemma[i + 1 :]
                                if voc_alt_lemma in index:
                                    selected_lemmas.append(voc_alt_lemma)
                for old, new in rules.get("voc_alt_2", []):
                    if old in lemma:
                        # Only the first occurrence is replaced
                        voc_alt_lemma = lemma.replace(old, new, 1)
                        if voc_alt_lemma in index:
                            selected_lemmas.append(voc_alt_lemma)
        # Additional rule for verbs that lose the accent mark when lemmatized,
        # e.g., amplían -> ampliar
        # NOTE(review): these candidates are added after the index check, so
        # they are only returned as fallbacks and never matched against the
        # index - confirm this is intended.
        accent_rules = rules.get("accents", [])
        additional_lemmas = [
            re.sub(old, new, possible_lemma)
            for possible_lemma in possible_lemmas
            for old, new in accent_rules
        ]
        possible_lemmas.extend(additional_lemmas)
        # If one or more of the created possible lemmas are in the lookup
        # list, return all of them
        if selected_lemmas:
            return selected_lemmas
        if possible_lemmas:
            return possible_lemmas
        return [word]

    def lemmatize_verb_pron(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a merged verb+pronoun(s) token.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        rule (str): Not used; the rules are selected again for the verb and
            for each pronoun.
        index (List[str]): The verb lookup list.
        RETURNS (List[str]): A single lemma made of the verb lemma followed
            by the pronoun lemmas, separated by spaces.
        """
        # Strip and collect pronouns, keeping them in surface order. The loop
        # condition allows at most four pronouns to be stripped.
        prons = []
        verb = word
        m = self._VERB_PRON_PATTERN.search(verb)
        while m is not None and len(prons) <= 3:
            verb = m.group(1)
            prons.insert(0, m.group(2))
            m = self._VERB_PRON_PATTERN.search(verb)
        # Strip accents from verb form
        for old, new in self.lookups.get_table("lemma_rules").get("accents", []):
            verb = re.sub(old, new, verb)
        # Lemmatize the verb and pronouns, checking for exceptions
        exc_table = self.lookups.get_table("lemma_exc")
        exc = exc_table.get("verb", {}).get(verb)
        if exc is not None:
            verb_lemma = exc[0]
        else:
            rule = self.select_rule("verb", features)
            # Drop PronType=Prs so lemmatize_verb does not route back here;
            # set() makes this work for list and set inputs alike.
            verb_features = set(features) - {"PronType=Prs"}
            verb_lemma = self.lemmatize_verb(verb, verb_features, rule, index)[0]
        # Pronoun candidates are checked against the pronoun lookup list,
        # not against the verb list this method received.
        pron_index = self.lookups.get_table("lemma_index").get("pron", [])
        pron_lemmas = []
        for pron in prons:
            exc = exc_table.get("pron", {}).get(pron)
            if exc is not None:
                pron_lemmas.append(exc[0])
            else:
                rule = self.select_rule("pron", features)
                pron_lemmas.append(
                    self.lemmatize_pron(pron, features, rule, pron_index)[0]
                )
        # Join only the parts that exist so that a form without any stripped
        # pronoun does not get a trailing space
        return [" ".join([verb_lemma] + pron_lemmas)]

View File

@ -7,7 +7,7 @@ from spacy.util import get_lang_class
# fmt: off
# Only include languages with no external dependencies
# excluded: ru, uk
# excluded for custom tables: es, pl
LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
# fmt: on