mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 12:56:29 +03:00
aafee5e1b7
* Fix lookup usage (fix #11347) Before using the lookups table in the French (and Catalan) lemmatizers, there's a check to see if the current term is in the table. But it's checking a string against hashes, so it's always false. Also the table lookup function is designed so you don't have to do that anyway. * Use the lookup table directly * Use string, not token
82 lines
2.8 KiB
Python
82 lines
2.8 KiB
Python
from typing import List, Tuple
|
|
|
|
from ...pipeline import Lemmatizer
|
|
from ...tokens import Token
|
|
|
|
|
|
class CatalanLemmatizer(Lemmatizer):
|
|
"""
|
|
Copied from French Lemmatizer
|
|
Catalan language lemmatizer applies the default rule based lemmatization
|
|
procedure with some modifications for better Catalan language support.
|
|
|
|
The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
|
|
the rule-based lemmatization. As a last resort, the lemmatizer checks in
|
|
the lookup table.
|
|
"""
|
|
|
|
@classmethod
|
|
def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
|
|
if mode == "rule":
|
|
required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
|
|
return (required, [])
|
|
else:
|
|
return super().get_lookups_config(mode)
|
|
|
|
def rule_lemmatize(self, token: Token) -> List[str]:
|
|
cache_key = (token.orth, token.pos)
|
|
if cache_key in self.cache:
|
|
return self.cache[cache_key]
|
|
string = token.text
|
|
univ_pos = token.pos_.lower()
|
|
if univ_pos in ("", "eol", "space"):
|
|
return [string.lower()]
|
|
elif "lemma_rules" not in self.lookups or univ_pos not in (
|
|
"noun",
|
|
"verb",
|
|
"adj",
|
|
"adp",
|
|
"adv",
|
|
"aux",
|
|
"cconj",
|
|
"det",
|
|
"pron",
|
|
"punct",
|
|
"sconj",
|
|
):
|
|
return self.lookup_lemmatize(token)
|
|
index_table = self.lookups.get_table("lemma_index", {})
|
|
exc_table = self.lookups.get_table("lemma_exc", {})
|
|
rules_table = self.lookups.get_table("lemma_rules", {})
|
|
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
|
index = index_table.get(univ_pos, {})
|
|
exceptions = exc_table.get(univ_pos, {})
|
|
rules = rules_table.get(univ_pos, [])
|
|
string = string.lower()
|
|
forms = []
|
|
if string in index:
|
|
forms.append(string)
|
|
self.cache[cache_key] = forms
|
|
return forms
|
|
forms.extend(exceptions.get(string, []))
|
|
oov_forms = []
|
|
if not forms:
|
|
for old, new in rules:
|
|
if string.endswith(old):
|
|
form = string[: len(string) - len(old)] + new
|
|
if not form:
|
|
pass
|
|
elif form in index or not form.isalpha():
|
|
forms.append(form)
|
|
else:
|
|
oov_forms.append(form)
|
|
if not forms:
|
|
forms.extend(oov_forms)
|
|
|
|
# use lookups, and fall back to the token itself
|
|
if not forms:
|
|
forms.append(lookup_table.get(string, [string])[0])
|
|
forms = list(dict.fromkeys(forms))
|
|
self.cache[cache_key] = forms
|
|
return forms
|