Mirror of https://github.com/explosion/spaCy.git, synced 2024-11-14 05:37:03 +03:00
b98d216205
* Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data
* Update tokenizer settings for UD Catalan AnCora v2.7 with merged multi-word tokens
* Update test
* Move prefix pattern to more generic infix pattern
* Clean up
82 lines
2.8 KiB
Python
from typing import List, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class CatalanLemmatizer(Lemmatizer):
    """
    Copied from French Lemmatizer
    Catalan language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Catalan language support.

    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
    the rule-based lemmatization. As a last resort, the lemmatizer checks in
    the lookup table.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        if mode == "rule":
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        # Results are cached per (orth, pos) pair.
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        elif "lemma_rules" not in self.lookups or univ_pos not in (
            "noun",
            "verb",
            "adj",
            "adp",
            "adv",
            "aux",
            "cconj",
            "det",
            "pron",
            "punct",
            "sconj",
        ):
            # Fall back to the lookup table for tags the rules do not cover.
            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        # A form already listed in the index is its own lemma.
        if string in index:
            forms.append(string)
            self.cache[cache_key] = forms
            return forms
        # Known exceptions take precedence over the suffix rules.
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            # Apply suffix rewrite rules; alphabetic candidates not found in
            # the index are set aside as out-of-vocabulary forms.
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        # As a last resort, try the lookup table, then keep the form itself.
        if not forms and string in lookup_table.keys():
            forms.append(self.lookup_lemmatize(token)[0])
        if not forms:
            forms.append(string)
        forms = list(set(forms))
        self.cache[cache_key] = forms
        return forms
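For context, a minimal usage sketch (not part of this file): in spaCy v3 the rule mode of this lemmatizer relies on each token's POS tag plus the Catalan lookup tables, so the most direct way to exercise it is through a trained Catalan pipeline. The package name ca_core_news_sm and the sample sentence below are assumptions for illustration, not taken from this file.

import spacy

# Assumes a trained Catalan pipeline is installed, e.g.:
#   python -m spacy download ca_core_news_sm
nlp = spacy.load("ca_core_news_sm")

doc = nlp("Les nenes cantaven cançons a la plaça.")
for token in doc:
    # token.lemma_ is filled in by the pipeline's lemmatizer component
    print(token.text, token.pos_, token.lemma_)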