Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-27 09:44:36 +03:00
Add Spanish rule-based lemmatizer (#6833)
* Initial Spanish lemmatizer
* Handle merged verb+pron(s) multi-word tokens
* Use VERB for AUX rule lookup
* Add morph to lemma cache key
* Fix aux lookups, minor refactoring
* Improve verb+pron handling
* Move verb+pron handling into its own method
* Check for exceptions (primarily for se)
* Collect pronouns in the same (not reversed) order
* Only add modified possible lemmas
This commit is contained in:
parent abb24fdc0f
commit d17afb4826
spacy/lang/es/__init__.py

@@ -1,6 +1,9 @@
+from typing import Optional
+from thinc.api import Model
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
+from .lemmatizer import SpanishLemmatizer
 from .syntax_iterators import SYNTAX_ITERATORS
 from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
 from ...language import Language
@@ -20,4 +23,14 @@ class Spanish(Language):
     Defaults = SpanishDefaults
 
 
+@Spanish.factory(
+    "lemmatizer",
+    assigns=["token.lemma"],
+    default_config={"model": None, "mode": "rule", "overwrite": False},
+    default_score_weights={"lemma_acc": 1.0},
+)
+def make_lemmatizer(nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool):
+    return SpanishLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
+
+
 __all__ = ["Spanish"]
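For reference, this factory registration is what lets the new lemmatizer be
added to a Spanish pipeline through the normal spaCy v3 API. A minimal usage
sketch (it assumes a trained pipeline such as es_core_news_sm for POS/morph
annotation and the es tables from spacy-lookups-data, neither of which is part
of this diff):

import spacy

nlp = spacy.load("es_core_news_sm", exclude=["lemmatizer"])
lemmatizer = nlp.add_pipe("lemmatizer", config={"mode": "rule"})
lemmatizer.initialize()  # loads the required lemma_* tables from spacy-lookups-data
doc = nlp("Las órdenes fueron claras.")
print([t.lemma_ for t in doc])
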
spacy/lang/es/lemmatizer.py (new file, 423 lines)

@@ -0,0 +1,423 @@
from typing import List, Optional, Tuple
import re

from ...pipeline import Lemmatizer
from ...tokens import Token


class SpanishLemmatizer(Lemmatizer):
    """
    Spanish rule-based lemmatizer with morph-based rule selection.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "rule":
            required = ["lemma_rules", "lemma_rules_groups", "lemma_index", "lemma_exc"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        cache_key = (token.orth, token.pos, str(token.morph))
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        pos = token.pos_.lower()
        features = set(token.morph)
        if pos in ("", "eol", "space"):
            return [string.lower()]
        if pos in (
            "adp",
            "cconj",
            "intj",
            "part",
            "propn",
            "punct",
            "sconj",
            "sym",
            "x",
        ):
            if token.is_sent_start and pos != "propn":
                return [string.lower()]
            else:
                return [string]

        string = string.lower()
        exc = self.lookups.get_table("lemma_exc").get(pos, {}).get(string)
        if exc is not None:
            lemmas = list(exc)
        else:
            if pos == "aux":
                rule_pos = "verb"
            else:
                rule_pos = pos
            rule = self.select_rule(rule_pos, features)
            index = self.lookups.get_table("lemma_index").get(rule_pos, [])
            lemmas = getattr(self, "lemmatize_" + rule_pos)(
                string, features, rule, index
            )
        # Remove duplicates but preserve the ordering
        lemmas = list(dict.fromkeys(lemmas))

        self.cache[cache_key] = lemmas
        return lemmas

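# Two details above, shown in plain Python: the cache key includes
# str(token.morph) (one of the fixes in this commit), so tokens with the same
# form and coarse POS but different morphological features get separate cache
# entries; and dict.fromkeys is an order-preserving dedup of the candidates.
lemmas = ["orden", "ordenar", "orden"]
assert list(dict.fromkeys(lemmas)) == ["orden", "ordenar"]
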
    def select_rule(self, pos: str, features: List[str]) -> Optional[str]:
        groups = self.lookups.get_table("lemma_rules_groups")
        if pos in groups:
            for group in groups[pos]:
                if set(group[1]).issubset(features):
                    return group[0]
        return None

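# How select_rule walks the groups table, sketched on hypothetical entries
# (the real lemma_rules_groups table ships with spacy-lookups-data, and the
# rule names below are illustrative only): each group is a
# (rule_name, required_features) pair, and the first group whose required
# features are a subset of the token's morph is selected.
groups = {"verb": [("verb_ger", ["VerbForm=Ger"]), ("verb_part", ["VerbForm=Part"])]}
features = {"VerbForm=Ger", "Number=Sing"}
rule = next((name for name, req in groups["verb"] if set(req).issubset(features)), None)
assert rule == "verb_ger"
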
    def lemmatize_adj(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize an adjective.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # Initialize empty lists for the generated lemmas
        possible_lemmas = []
        selected_lemmas = []

        # Apply lemmatization rules
        for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
            possible_lemma = re.sub(old + "$", new, word)
            if possible_lemma != word:
                possible_lemmas.append(possible_lemma)

        # Additional rule for plurals that go from esdrújula to grave and end
        # in 'n' or 's', e.g., jóvenes -> joven
        additional_lemmas = []
        if "Number=Plur" in features:
            for possible_lemma in possible_lemmas:
                if possible_lemma.endswith("n") or possible_lemma.endswith("s"):
                    for old, new in self.lookups.get_table("lemma_rules").get(
                        "accents", []
                    ):
                        additional_lemmas.append(re.sub(old, new, possible_lemma))
        possible_lemmas.extend(additional_lemmas)

        for lemma in possible_lemmas:
            if lemma in index:
                selected_lemmas.append(lemma)
        # If one or more of the created possible lemmas are in the lookup list,
        # return all of them
        if len(selected_lemmas) > 0:
            return selected_lemmas
        elif len(possible_lemmas) > 0:
            return possible_lemmas
        else:
            return [word]

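# The esdrújula -> grave step in isolation, with a hypothetical stand-in for
# the "accents" rule table (the shipped table lives in spacy-lookups-data).
# Each accent rule is applied to each plural-derived candidate, and the index
# lookup afterwards keeps only the real word, e.g. "joven" rather than "jóven".
import re

ACCENTS = [("á", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ú", "u")]
possible_lemma = re.sub("es$", "", "jóvenes")  # a suffix rule yields "jóven"
additional_lemmas = [re.sub(old, new, possible_lemma) for old, new in ACCENTS]
assert "joven" in additional_lemmas
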
    def lemmatize_adv(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize an adverb.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # Apply lemmatization rules
        for old, new in self.lookups.get_table("lemma_rules").get("adverbs", []):
            if word == old:
                return [new]

        # If none of the rules applies, return the original word
        return [word]

    def lemmatize_det(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a determiner.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # Initialize empty lists for the generated lemmas
        possible_lemmas = []
        selected_lemmas = []

        # First, search in rules specific to determiners
        for old, new in self.lookups.get_table("lemma_rules").get("det", []):
            if word == old:
                return [new]
        # If none of the specific rules apply, search in the common rules for
        # determiners and pronouns that follow a unique pattern for
        # lemmatization. If the word is in the list, return the corresponding
        # lemma.
        for old, new in self.lookups.get_table("lemma_rules").get(
            "det_and_pron_fixed", []
        ):
            if word == old:
                return [new]
        # If the word is not in the list of unique determiners and pronouns,
        # apply general rules of lemmatization. Include the original word in
        # the list of possible lemmas.
        for old, new in self.lookups.get_table("lemma_rules").get(
            "det_and_pron_general", []
        ):
            possible_lemma = re.sub(old + "$", new, word)
            possible_lemmas.append(possible_lemma)
        possible_lemmas.append(word)

        if len(possible_lemmas) == 1:
            return possible_lemmas
        elif len(possible_lemmas) > 1:
            for lemma in possible_lemmas:
                if lemma in index:
                    selected_lemmas.append(lemma)
            if len(selected_lemmas) >= 1:
                return selected_lemmas
            else:
                return possible_lemmas

    def lemmatize_noun(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a noun.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # Initialize empty lists for the generated lemmas
        possible_lemmas = []
        selected_lemmas = []

        # Apply lemmatization rules
        for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
            possible_lemma = re.sub(old + "$", new, word)
            if possible_lemma != word:
                possible_lemmas.append(possible_lemma)

        # Additional rule for plurals that go from esdrújula to grave and end
        # in 'n' or 's', e.g., órdenes -> orden, exámenes -> examen
        additional_lemmas = []
        if "Number=Plur" in features:
            for possible_lemma in possible_lemmas:
                if possible_lemma.endswith("n") or possible_lemma.endswith("s"):
                    for old, new in self.lookups.get_table("lemma_rules").get(
                        "accents", []
                    ):
                        additional_lemmas.append(re.sub(old, new, possible_lemma))
        possible_lemmas.extend(additional_lemmas)

        for lemma in possible_lemmas:
            if lemma in index:
                selected_lemmas.append(lemma)
        # If one or more of the created possible lemmas are in the lookup list,
        # return all of them
        if len(selected_lemmas) > 0:
            return selected_lemmas
        elif len(possible_lemmas) > 0:
            return possible_lemmas
        else:
            return [word]

    def lemmatize_num(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a numeral.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # If the word is in the list of rules for numerals, return the
        # corresponding lemma
        for old, new in self.lookups.get_table("lemma_rules").get("num", []):
            if word == old:
                return [new]

        # Normalize punctuation
        splitted_word = word.split(",")
        if re.search(r"(\.)([0-9]{3})$", splitted_word[0]):
            word = re.sub(r"\.", r"", word)
        word = re.sub(r",", r".", word)
        return [word]

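# The punctuation normalization above, checked in isolation (assuming the
# reconstruction of the method as written): a Spanish-style number with dot
# thousands separators and a decimal comma becomes English-style.
import re

word = "1.234,56"
if re.search(r"(\.)([0-9]{3})$", word.split(",")[0]):
    word = re.sub(r"\.", r"", word)  # drop thousands separators: "1234,56"
word = re.sub(r",", r".", word)      # decimal comma -> dot: "1234.56"
assert word == "1234.56"
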
    def lemmatize_pron(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a pronoun.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # Initialize empty lists for the generated lemmas
        possible_lemmas = []
        selected_lemmas = []

        # First, search in rules specific to pronouns
        for old, new in self.lookups.get_table("lemma_rules").get("pron", []):
            if word == old:
                return [new]
        # If none of the specific rules apply, search in the common rules for
        # determiners and pronouns that follow a unique pattern for
        # lemmatization. If the word is in the list, return the corresponding
        # lemma.
        for old, new in self.lookups.get_table("lemma_rules").get(
            "det_and_pron_fixed", []
        ):
            if word == old:
                return [new]
        # If the word is not in the list of unique determiners and pronouns,
        # apply general rules of lemmatization. Include the original word in
        # the list of possible lemmas.
        for old, new in self.lookups.get_table("lemma_rules").get(
            "det_and_pron_general", []
        ):
            possible_lemma = re.sub(old + "$", new, word)
            if possible_lemma != word:
                possible_lemmas.append(possible_lemma)
        possible_lemmas.append(word)

        if len(possible_lemmas) == 1:
            return possible_lemmas
        elif len(possible_lemmas) > 1:
            for lemma in possible_lemmas:
                if lemma in index:
                    selected_lemmas.append(lemma)
            if len(selected_lemmas) >= 1:
                return selected_lemmas
            else:
                return possible_lemmas

    def lemmatize_verb(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        """
        Lemmatize a verb.

        word (str): The word to lemmatize.
        features (List[str]): The morphological features as a list of Feat=Val
            pairs.
        index (List[str]): The POS-specific lookup list.

        RETURNS (List[str]): The list of lemmas.
        """
        # Exceptions for verb+pronoun(s)
        if "PronType=Prs" in features:
            return self.lemmatize_verb_pron(word, features, rule, index)

        # Initialize empty lists for the generated lemmas
        possible_lemmas = []
        selected_lemmas = []

        # Apply lemmatization rules
        for old, new in self.lookups.get_table("lemma_rules").get(rule, []):
            possible_lemma = re.sub(old + "$", new, word)
            if possible_lemma != word:
                possible_lemmas.append(possible_lemma)

        for lemma in possible_lemmas:
            if lemma in index:
                selected_lemmas.append(lemma)
        if len(selected_lemmas) == 0:
            # If none of the possible lemmas are in the lookup list,
            # apply vocalic alternation rules and search in the lookup list
            # again
            for lemma in possible_lemmas:
                for old, new in self.lookups.get_table("lemma_rules").get(
                    "voc_alt_1", []
                ):
                    if old in lemma:
                        for i, char in enumerate(lemma):
                            if char == old:
                                voc_alt_lemma = lemma[:i] + new + lemma[i + 1 :]
                                if voc_alt_lemma in index:
                                    selected_lemmas.append(voc_alt_lemma)
                for old, new in self.lookups.get_table("lemma_rules").get(
                    "voc_alt_2", []
                ):
                    if old in lemma:
                        voc_alt_lemma = lemma.replace(old, new, 1)
                        if voc_alt_lemma in index:
                            selected_lemmas.append(voc_alt_lemma)
        # Additional rule for verbs that lose the accent mark when lemmatized,
        # e.g., amplían -> ampliar
        additional_lemmas = []
        for possible_lemma in possible_lemmas:
            for old, new in self.lookups.get_table("lemma_rules").get("accents", []):
                additional_lemmas.append(re.sub(old, new, possible_lemma))
        possible_lemmas.extend(additional_lemmas)

        # If one or more of the created possible lemmas are in the lookup list,
        # return all of them
        if len(selected_lemmas) > 0:
            return selected_lemmas
        elif len(possible_lemmas) > 0:
            return possible_lemmas
        else:
            return [word]

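# The vocalic alternation fallback, sketched with hypothetical voc_alt entries
# (the shipped tables are in spacy-lookups-data): for a diphthongized form
# like "piensa", the suffix rules produce the non-word candidate "piensar",
# and the ie -> e alternation recovers "pensar", which the index confirms.
voc_alt_2 = [("ie", "e"), ("ue", "o")]  # illustrative entries only
index = {"pensar", "contar"}
candidate = "piensar"
for old, new in voc_alt_2:
    if old in candidate:
        alt = candidate.replace(old, new, 1)
        if alt in index:
            print(alt)  # pensar
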
    def lemmatize_verb_pron(
        self, word: str, features: List[str], rule: str, index: List[str]
    ) -> List[str]:
        # Strip and collect pronouns
        pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
        prons = []
        verb = word
        m = re.search(pron_patt, verb)
        while m is not None and len(prons) <= 3:
            verb = re.sub(m.group(2) + "$", "", verb)
            prons = [m.group(2)] + prons
            m = re.search(pron_patt, verb)
        # Strip accents from verb form
        for old, new in self.lookups.get_table("lemma_rules").get("accents", []):
            verb = re.sub(old, new, verb)
        # Lemmatize the verb and pronouns, checking for exceptions
        exc = self.lookups.get_table("lemma_exc").get("verb", {}).get(verb)
        if exc is not None:
            verb_lemma = exc[0]
        else:
            rule = self.select_rule("verb", features)
            verb_lemma = self.lemmatize_verb(
                verb, features - {"PronType=Prs"}, rule, index
            )[0]
        pron_lemmas = []
        for pron in prons:
            exc = self.lookups.get_table("lemma_exc").get("pron", {}).get(pron)
            if exc is not None:
                pron_lemmas.append(exc[0])
            else:
                rule = self.select_rule("pron", features)
                pron_lemmas.append(self.lemmatize_pron(pron, features, rule, index)[0])
        return [verb_lemma + " " + " ".join(pron_lemmas)]
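The pronoun-stripping loop is the heart of the verb+pron handling; here it is
in isolation for the clitic cluster "dámelo". Pronouns are peeled off the end
one at a time and collected in surface order rather than reversed (one of the
fixes in this commit); the accents table then normalizes the remaining stem
"dá" before the verb lookup.

import re

pron_patt = "^(.*?)([mts]e|l[aeo]s?|n?os)$"
verb, prons = "dámelo", []
m = re.search(pron_patt, verb)
while m is not None and len(prons) <= 3:
    verb = re.sub(m.group(2) + "$", "", verb)
    prons = [m.group(2)] + prons
    m = re.search(pron_patt, verb)
print(verb, prons)  # dá ['me', 'lo']
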
@@ -7,7 +7,7 @@ from spacy.util import get_lang_class
 # fmt: off
 # Only include languages with no external dependencies
 # excluded: ru, uk
-# excluded for custom tables: pl
+# excluded for custom tables: es, pl
 LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"]
 # fmt: on
 