spaCy/spacy/lang/ca/lemmatizer.py

from typing import List, Tuple

from ...pipeline import Lemmatizer
from ...tokens import Token


class CatalanLemmatizer(Lemmatizer):
    """
    Rule-based lemmatizer for Catalan, adapted from the French lemmatizer.

    It applies the default rule-based lemmatization procedure with some
    modifications for better Catalan language support:

    * In addition to 'NOUN', 'VERB' and 'ADJ', the parts of speech 'ADP',
      'ADV', 'AUX', 'CCONJ', 'DET', 'PRON', 'PUNCT' and 'SCONJ' go through
      the rule-based procedure.
    * As a last resort, when neither the index, the exceptions nor the suffix
      rules produce a lemma, the lemmatizer checks the lookup table and
      finally falls back to the lower-cased token text.
    """

    @classmethod
    def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:
        """Return the lookup table names used by the given lemmatizer mode.

        mode (str): The lemmatizer mode.
        RETURNS (Tuple[List[str], List[str]]): For "rule" mode, the four
            tables read by `rule_lemmatize` as the first list and an empty
            second list. Any other mode is delegated to the parent class.
        """
        if mode == "rule":
            # "lemma_lookup" is listed here because rule_lemmatize uses it as
            # its final fallback.
            required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]
            return (required, [])
        else:
            return super().get_lookups_config(mode)

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize a token using the index, exception, rule and lookup tables.

        Resolution order for the lower-cased token text:
        1. If it is listed in the index for its POS, it is its own lemma.
        2. Otherwise the exceptions listed for that form are used.
        3. Otherwise every matching suffix rule is applied; results that are
           in the index (or are not purely alphabetic) are preferred over
           out-of-vocabulary results.
        4. Otherwise the lookup table entry is used, falling back to the
           lower-cased text itself.

        token (Token): The token to lemmatize.
        RETURNS (List[str]): The candidate lemmas, without duplicates and in
            the order they were found.
        """
        # Results are memoized per (orth, pos) pair.
        cache_key = (token.orth, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]
        elif "lemma_rules" not in self.lookups or univ_pos not in (
            "noun",
            "verb",
            "adj",
            "adp",
            "adv",
            "aux",
            "cconj",
            "det",
            "pron",
            "punct",
            "sconj",
        ):
            # No rules available, or a POS the rules do not cover: defer to
            # plain lookup lemmatization.
            return self.lookup_lemmatize(token)
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])
        string = string.lower()
        forms = []
        if string in index:
            # A form listed in the index is already a lemma.
            forms.append(string)
            self.cache[cache_key] = forms
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        # The rule consumed the whole string; nothing usable.
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        # Keep unattested candidates as a second choice.
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)

        # use lookups, and fall back to the token itself
        if not forms:
            lookup_lemma = lookup_table.get(string, string)
            if isinstance(lookup_lemma, str):
                # A plain string entry is the lemma itself; indexing it would
                # truncate the lemma to its first character.
                forms.append(lookup_lemma)
            elif lookup_lemma:
                # A list entry: take its first item.
                forms.append(lookup_lemma[0])
            else:
                # Empty entry: nothing usable, keep the lower-cased text.
                forms.append(string)
        # Remove duplicates while preserving order.
        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
Update Catalan language data (#8308) * Update Catalan language data Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data * Update tokenizer settings for UD Catalan AnCora Update for UD Catalan AnCora v2.7 with merged multi-word tokens. * Update test * Move prefix patternt to more generic infix pattern * Clean up 2021-06-11 11:21:22 +03:00			`from typing import List, Tuple`

			`from ...pipeline import Lemmatizer`
			`from ...tokens import Token`


			`class CatalanLemmatizer(Lemmatizer):`
			`"""`
			`Copied from French Lemmatizer`
			`Catalan language lemmatizer applies the default rule based lemmatization`
			`procedure with some modifications for better Catalan language support.`

			`The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use`
			`the rule-based lemmatization. As a last resort, the lemmatizer checks in`
			`the lookup table.`
			`"""`

			`@classmethod`
			`def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]:`
			`if mode == "rule":`
			`required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"]`
			`return (required, [])`
			`else:`
			`return super().get_lookups_config(mode)`

			`def rule_lemmatize(self, token: Token) -> List[str]:`
			`cache_key = (token.orth, token.pos)`
			`if cache_key in self.cache:`
			`return self.cache[cache_key]`
			`string = token.text`
			`univ_pos = token.pos_.lower()`
			`if univ_pos in ("", "eol", "space"):`
			`return [string.lower()]`
			`elif "lemma_rules" not in self.lookups or univ_pos not in (`
			`"noun",`
			`"verb",`
			`"adj",`
			`"adp",`
			`"adv",`
			`"aux",`
			`"cconj",`
			`"det",`
			`"pron",`
			`"punct",`
			`"sconj",`
			`):`
			`return self.lookup_lemmatize(token)`
			`index_table = self.lookups.get_table("lemma_index", {})`
			`exc_table = self.lookups.get_table("lemma_exc", {})`
			`rules_table = self.lookups.get_table("lemma_rules", {})`
			`lookup_table = self.lookups.get_table("lemma_lookup", {})`
			`index = index_table.get(univ_pos, {})`
			`exceptions = exc_table.get(univ_pos, {})`
			`rules = rules_table.get(univ_pos, [])`
			`string = string.lower()`
			`forms = []`
			`if string in index:`
			`forms.append(string)`
			`self.cache[cache_key] = forms`
			`return forms`
			`forms.extend(exceptions.get(string, []))`
			`oov_forms = []`
			`if not forms:`
			`for old, new in rules:`
			`if string.endswith(old):`
			`form = string[: len(string) - len(old)] + new`
			`if not form:`
			`pass`
			`elif form in index or not form.isalpha():`
			`forms.append(form)`
			`else:`
			`oov_forms.append(form)`
			`if not forms:`
			`forms.extend(oov_forms)`
Fix lookup usage in French/Catalan (fix #11347) (#11382) * Fix lookup usage (fix #11347) Before using the lookups table in the French (and Catalan) lemmatizers, there's a check to see if the current term is in the table. But it's checking a string against hashes, so it's always false. Also the table lookup function is designed so you don't have to do that anyway. * Use the lookup table directly * Use string, not token 2022-08-29 11:32:38 +03:00
			`# use lookups, and fall back to the token itself`
Update Catalan language data (#8308) * Update Catalan language data Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data * Update tokenizer settings for UD Catalan AnCora Update for UD Catalan AnCora v2.7 with merged multi-word tokens. * Update test * Move prefix patternt to more generic infix pattern * Clean up 2021-06-11 11:21:22 +03:00			`if not forms:`
Fix lookup usage in French/Catalan (fix #11347) (#11382) * Fix lookup usage (fix #11347) Before using the lookups table in the French (and Catalan) lemmatizers, there's a check to see if the current term is in the table. But it's checking a string against hashes, so it's always false. Also the table lookup function is designed so you don't have to do that anyway. * Use the lookup table directly * Use string, not token 2022-08-29 11:32:38 +03:00			`forms.append(lookup_table.get(string, [string])[0])`
Fix inconsistent lemmas (#9405) * Add util function to unique lists and preserve order * Use unique function instead of list(set()) list(set()) has the issue that it's not consistent between runs of the Python interpreter, so order can vary. list(set()) calls were left in a few places where they were behind calls to sorted(). I think in this case the calls to list() can be removed, but this commit doesn't do that. * Use the existing pattern for this 2021-10-11 12:38:45 +03:00			`forms = list(dict.fromkeys(forms))`
Update Catalan language data (#8308) * Update Catalan language data Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data * Update tokenizer settings for UD Catalan AnCora Update for UD Catalan AnCora v2.7 with merged multi-word tokens. * Update test * Move prefix patternt to more generic infix pattern * Clean up 2021-06-11 11:21:22 +03:00			`self.cache[cache_key] = forms`
			`return forms`