spaCy/spacy/lang/fr/lemmatizer.py

# coding: utf8
from __future__ import unicode_literals

from ...lemmatizer import Lemmatizer
from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP
from ...symbols import SCONJ, CCONJ
from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos


class FrenchLemmatizer(Lemmatizer):
    """
    French language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better French language support.

    The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use
    the rule-based lemmatization. As a last resort, the lemmatizer checks in
    the lookup table.
    """

    def __call__(self, string, univ_pos, morphology=None):
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if "lemma_rules" not in self.lookups:
            return [lookup_table.get(string, string)]
        if univ_pos in (NOUN, "NOUN", "noun"):
            univ_pos = "noun"
        elif univ_pos in (VERB, "VERB", "verb"):
            univ_pos = "verb"
        elif univ_pos in (ADJ, "ADJ", "adj"):
            univ_pos = "adj"
        elif univ_pos in (ADP, "ADP", "adp"):
            univ_pos = "adp"
        elif univ_pos in (ADV, "ADV", "adv"):
            univ_pos = "adv"
        elif univ_pos in (AUX, "AUX", "aux"):
            univ_pos = "aux"
        elif univ_pos in (CCONJ, "CCONJ", "cconj"):
            univ_pos = "cconj"
        elif univ_pos in (DET, "DET", "det"):
            univ_pos = "det"
        elif univ_pos in (PRON, "PRON", "pron"):
            univ_pos = "pron"
        elif univ_pos in (PUNCT, "PUNCT", "punct"):
            univ_pos = "punct"
        elif univ_pos in (SCONJ, "SCONJ", "sconj"):
            univ_pos = "sconj"
        else:
            return [self.lookup(string)]
        # See Issue #435 for example of where this logic is requied.
        if self.is_base_form(univ_pos, morphology):
            return list(set([string.lower()]))
        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        lemmas = self.lemmatize(
            string,
            index_table.get(univ_pos, {}),
            exc_table.get(univ_pos, {}),
            rules_table.get(univ_pos, []),
        )
        return lemmas

    def is_base_form(self, univ_pos, morphology=None):
        """
        Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.
        """
        morphology = {} if morphology is None else morphology
        others = [
            key
            for key in morphology
            if key not in (POS, "Number", "POS", "VerbForm", "Tense")
        ]
        if univ_pos == "noun" and morphology.get("Number") == "sing":
            return True
        elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
            return True
        # This maps 'VBP' to base form -- probably just need 'IS_BASE'
        # morphology
        elif univ_pos == "verb" and (
            morphology.get("VerbForm") == "fin"
            and morphology.get("Tense") == "pres"
            and morphology.get("Number") is None
            and not others
        ):
            return True
        elif univ_pos == "adj" and morphology.get("Degree") == "pos":
            return True
        elif VerbForm_inf in morphology:
            return True
        elif VerbForm_none in morphology:
            return True
        elif Number_sing in morphology:
            return True
        elif Degree_pos in morphology:
            return True
        else:
            return False

    def noun(self, string, morphology=None):
        return self(string, "noun", morphology)

    def verb(self, string, morphology=None):
        return self(string, "verb", morphology)

    def adj(self, string, morphology=None):
        return self(string, "adj", morphology)

    def punct(self, string, morphology=None):
        return self(string, "punct", morphology)

    def lookup(self, string, orth=None):
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        if orth is not None and orth in lookup_table:
            return lookup_table[orth][0]
        return string

    def lemmatize(self, string, index, exceptions, rules):
        lookup_table = self.lookups.get_table("lemma_lookup", {})
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
            return forms
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms and string in lookup_table.keys():
            forms.append(lookup_table[string][0])
        if not forms:
            forms.append(string)
        return list(set(forms))
Rule-based French Lemmatizer (#2818) <!--- Provide a general summary of your changes in the title. --> ## Description <!--- Use this section to describe your changes. If your changes required testing, include information about the testing environment and the tests you ran. If your test fixes a bug reported in an issue, don't forget to include the issue number. If your PR is still a work in progress, that's totally fine – just include a note to let us know. --> Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class. ### Types of change <!-- What type of change does your PR cover? Is it a bug fix, an enhancement or new feature, or a change to the documentation? --> - Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version. - Add several files containing exhaustive list of words for each part of speech - Add some lemma rules - Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX - Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentionned - Modify the lemmatize function to check in lookup table as a last resort - Init files are updated so the model can support all the functionalities mentioned above - Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [X] I have submitted the spaCy Contributor Agreement. - [X] I ran the tests, and all new and existing tests passed. - [X] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-10-13 17:38:21 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`from ...lemmatizer import Lemmatizer`
Move lookup tables out of the core library (#4346) * Add default to util.get_entry_point * Tidy up entry points * Read lookups from entry points * Remove lookup tables and related tests * Add lookups install option * Remove lemmatizer tests * Remove logic to process language data files * Update setup.cfg 2019-10-01 01:01:27 +03:00			`from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP`
			`from ...symbols import SCONJ, CCONJ`
			`from ...symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00

Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`class FrenchLemmatizer(Lemmatizer):`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00			`"""`
			`French language lemmatizer applies the default rule based lemmatization`
			`procedure with some modifications for better French language support.`

			`The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use`
			`the rule-based lemmatization. As a last resort, the lemmatizer checks in`
			`the lookup table.`
			`"""`

			`def __call__(self, string, univ_pos, morphology=None):`
Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`lookup_table = self.lookups.get_table("lemma_lookup", {})`
			`if "lemma_rules" not in self.lookups:`
			`return [lookup_table.get(string, string)]`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00			`if univ_pos in (NOUN, "NOUN", "noun"):`
			`univ_pos = "noun"`
			`elif univ_pos in (VERB, "VERB", "verb"):`
			`univ_pos = "verb"`
			`elif univ_pos in (ADJ, "ADJ", "adj"):`
			`univ_pos = "adj"`
			`elif univ_pos in (ADP, "ADP", "adp"):`
			`univ_pos = "adp"`
			`elif univ_pos in (ADV, "ADV", "adv"):`
			`univ_pos = "adv"`
			`elif univ_pos in (AUX, "AUX", "aux"):`
			`univ_pos = "aux"`
			`elif univ_pos in (CCONJ, "CCONJ", "cconj"):`
			`univ_pos = "cconj"`
			`elif univ_pos in (DET, "DET", "det"):`
			`univ_pos = "det"`
			`elif univ_pos in (PRON, "PRON", "pron"):`
			`univ_pos = "pron"`
			`elif univ_pos in (PUNCT, "PUNCT", "punct"):`
			`univ_pos = "punct"`
			`elif univ_pos in (SCONJ, "SCONJ", "sconj"):`
			`univ_pos = "sconj"`
			`else:`
💫 Adjust Table API and add docs (#4289) * Adjust Table API and add docs * Add attributes and update description [ci skip] * Use strings.get_string_id instead of hash_string * Fix table method calls * Make orth arg in Lemmatizer.lookup optional Fall back to string, which is now handled by Table.__contains__ out-of-the-box * Fix method name * Auto-format 2019-09-15 23:08:13 +03:00			`return [self.lookup(string)]`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00			`# See Issue #435 for example of where this logic is requied.`
			`if self.is_base_form(univ_pos, morphology):`
			`return list(set([string.lower()]))`
Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`index_table = self.lookups.get_table("lemma_index", {})`
			`exc_table = self.lookups.get_table("lemma_exc", {})`
			`rules_table = self.lookups.get_table("lemma_rules", {})`
			`lemmas = self.lemmatize(`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00			`string,`
Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`index_table.get(univ_pos, {}),`
			`exc_table.get(univ_pos, {}),`
			`rules_table.get(univ_pos, []),`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00			`)`
			`return lemmas`

			`def is_base_form(self, univ_pos, morphology=None):`
			`"""`
			`Check whether we're dealing with an uninflected paradigm, so we can`
			`avoid lemmatization entirely.`
			`"""`
			`morphology = {} if morphology is None else morphology`
			`others = [`
			`key`
			`for key in morphology`
			`if key not in (POS, "Number", "POS", "VerbForm", "Tense")`
			`]`
			`if univ_pos == "noun" and morphology.get("Number") == "sing":`
			`return True`
			`elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":`
			`return True`
			`# This maps 'VBP' to base form -- probably just need 'IS_BASE'`
			`# morphology`
			`elif univ_pos == "verb" and (`
			`morphology.get("VerbForm") == "fin"`
			`and morphology.get("Tense") == "pres"`
			`and morphology.get("Number") is None`
			`and not others`
			`):`
			`return True`
			`elif univ_pos == "adj" and morphology.get("Degree") == "pos":`
			`return True`
			`elif VerbForm_inf in morphology:`
			`return True`
			`elif VerbForm_none in morphology:`
			`return True`
			`elif Number_sing in morphology:`
			`return True`
			`elif Degree_pos in morphology:`
			`return True`
			`else:`
			`return False`

			`def noun(self, string, morphology=None):`
			`return self(string, "noun", morphology)`

			`def verb(self, string, morphology=None):`
			`return self(string, "verb", morphology)`

			`def adj(self, string, morphology=None):`
			`return self(string, "adj", morphology)`

			`def punct(self, string, morphology=None):`
			`return self(string, "punct", morphology)`

💫 Adjust Table API and add docs (#4289) * Adjust Table API and add docs * Add attributes and update description [ci skip] * Use strings.get_string_id instead of hash_string * Fix table method calls * Make orth arg in Lemmatizer.lookup optional Fall back to string, which is now handled by Table.__contains__ out-of-the-box * Fix method name * Auto-format 2019-09-15 23:08:13 +03:00			`def lookup(self, string, orth=None):`
Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`lookup_table = self.lookups.get_table("lemma_lookup", {})`
			`if orth is not None and orth in lookup_table:`
			`return lookup_table[orth][0]`
💫 WIP: Basic lookup class scaffolding and JSON for all lemmati… (#4167) * Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance 2019-08-22 15:21:32 +03:00			`return string`

Refactor lemmatizer and data table integration (#4353) * Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5 2019-10-01 22:36:04 +03:00			`def lemmatize(self, string, index, exceptions, rules):`
			`lookup_table = self.lookups.get_table("lemma_lookup", {})`
			`string = string.lower()`
			`forms = []`
			`if string in index:`
			`forms.append(string)`
			`return forms`
			`forms.extend(exceptions.get(string, []))`
			`oov_forms = []`
			`if not forms:`
			`for old, new in rules:`
			`if string.endswith(old):`
			`form = string[: len(string) - len(old)] + new`
			`if not form:`
			`pass`
			`elif form in index or not form.isalpha():`
			`forms.append(form)`
			`else:`
			`oov_forms.append(form)`
			`if not forms:`
			`forms.extend(oov_forms)`
			`if not forms and string in lookup_table.keys():`
			`forms.append(lookup_table[string][0])`
			`if not forms:`
			`forms.append(string)`
			`return list(set(forms))`