spaCy/spacy/lemmatizer.py

from __future__ import unicode_literals, print_function
import codecs
import pathlib

import ujson as json

from .en.lemmatizer import INDEX, EXC, RULES
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none


class Lemmatizer(object):
    '''Look up candidate lemmas for a word from three tables: an index of
    known forms, a table of exceptions and a table of suffix rules. Each
    table is queried with a part-of-speech name such as 'noun' or 'verb'.'''

    @classmethod
    def load(cls, path, rules=None):
        '''Construct a lemmatizer from copies of the module-level INDEX, EXC
        and RULES tables.

        NOTE: neither `path` nor `rules` is read here; whatever is passed in,
        the tables imported at the top of this module are used.'''
        return cls(dict(INDEX), dict(EXC), dict(RULES))

    def __init__(self, index, exceptions, rules):
        '''Store the three lookup tables on the instance.'''
        self.index, self.exc, self.rules = index, exceptions, rules

    def __call__(self, string, univ_pos, morphology=None):
        '''Return the set of candidate lemmas for `string`.

        `univ_pos` may be one of the NOUN, VERB, ADJ or PUNCT symbols, in
        which case it is translated to the matching lower-case name; any
        other value is used for the table lookups as given.'''
        symbol_names = ((NOUN, 'noun'), (VERB, 'verb'),
                        (ADJ, 'adj'), (PUNCT, 'punct'))
        for symbol, name in symbol_names:
            if univ_pos == symbol:
                univ_pos = name
                break
        # Base forms are returned lower-cased without consulting the rules;
        # Issue #435 shows a case where this shortcut is required.
        if self.is_base_form(univ_pos, morphology):
            return {string.lower()}
        pos_index = self.index.get(univ_pos, {})
        pos_exceptions = self.exc.get(univ_pos, {})
        pos_rules = self.rules.get(univ_pos, [])
        return lemmatize(string, pos_index, pos_exceptions, pos_rules)

    def is_base_form(self, univ_pos, morphology=None):
        '''Decide whether the word is already in its uninflected form, in
        which case lemmatization can be skipped.

        Returns True for a singular noun or an infinitive verb carrying no
        features beyond POS/'pos', 'number' and 'verbform', and for any
        morphology whose 'morph' value is VerbForm_inf or VerbForm_none.'''
        if morphology is None:
            morphology = {}
        neutral_keys = (POS, 'number', 'pos', 'verbform')
        extra_features = [feat for feat in morphology
                          if feat not in neutral_keys]
        morph_id = morphology.get('morph', 0)
        if not extra_features:
            if univ_pos == 'noun' and morphology.get('number') == 'sing':
                return True
            if univ_pos == 'verb' and morphology.get('verbform') == 'inf':
                return True
        return morph_id in (VerbForm_inf, VerbForm_none)

    def noun(self, string, morphology=None):
        '''Lemmatize `string` using the 'noun' tables.'''
        return self(string, 'noun', morphology)

    def verb(self, string, morphology=None):
        '''Lemmatize `string` using the 'verb' tables.'''
        return self(string, 'verb', morphology)

    def adj(self, string, morphology=None):
        '''Lemmatize `string` using the 'adj' tables.'''
        return self(string, 'adj', morphology)

    def punct(self, string, morphology=None):
        '''Lemmatize `string` using the 'punct' tables.'''
        return self(string, 'punct', morphology)


def lemmatize(string, index, exceptions, rules):
    '''Return the set of candidate lemmas for `string`.

    The word is lower-cased first. Candidates are gathered from
    `exceptions` (a mapping from word to an iterable of lemmas) and from
    `rules` (an iterable of (old_suffix, new_suffix) pairs applied to every
    matching ending). A rule-derived form counts as attested when it is in
    `index` or contains a non-alphabetic character. Unattested forms are
    used only if nothing attested was found, and the lower-cased word
    itself is the last resort.'''
    word = string.lower()
    # NOTE: the word itself is not added as a candidate even when it is
    # present in `index`; see the discussion in Issue #435.
    attested = list(exceptions.get(word, []))
    unattested = []
    for suffix, replacement in rules:
        if not word.endswith(suffix):
            continue
        candidate = word[:len(word) - len(suffix)] + replacement
        if not candidate:
            # Never emit an empty lemma.
            continue
        if candidate in index or not candidate.isalpha():
            attested.append(candidate)
        else:
            unattested.append(candidate)
    return set(attested or unattested or [word])
* Add support for punctuation lemmatization, to handle unicode characters. This should help in addressing Issue #130 2015-10-09 10:44:21 +03:00			`from __future__ import unicode_literals, print_function`
* Fix data reading for lemmatizer 2015-01-04 22:01:32 +03:00			`import codecs`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`import pathlib`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
Fix json loading, for Python 3. 2016-10-20 22:23:26 +03:00			`import ujson as json`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00
Use new Lemmatizer data and remove file import Since there's currently only an English lemmatizer, the global Lemmatizer imports from spacy.en. This is unideal and still needs to be fixed. 2017-03-12 15:48:27 +03:00			`from .en.lemmatizer import INDEX, EXC, RULES`
Check POS key in lemmatizer, to update it for new data format 2016-12-18 15:28:20 +03:00			`from .symbols import POS, NOUN, VERB, ADJ, PUNCT`
Update base-form check in lemmatizer, for UD 2.0 morphology 2017-03-17 01:59:31 +03:00			`from .symbols import VerbForm_inf, VerbForm_none`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
			`class Lemmatizer(object):`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`@classmethod`
Wire up lemmatizer rules for English 2016-12-18 17:50:09 +03:00			`def load(cls, path, rules=None):`
Use new Lemmatizer data and remove file import Since there's currently only an English lemmatizer, the global Lemmatizer imports from spacy.en. This is unideal and still needs to be fixed. 2017-03-12 15:48:27 +03:00			`index = dict(INDEX)`
			`exc = dict(EXC)`
			`rules = dict(RULES)`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`return cls(index, exc, rules)`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`def __init__(self, index, exceptions, rules):`
			`self.index = index`
			`self.exc = exceptions`
			`self.rules = rules`
* Generalize lemmatizer 2015-08-25 16:46:19 +03:00
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`def __call__(self, string, univ_pos, morphology=None):`
Fix pos name conflict with morphology 2016-09-27 15:16:22 +03:00			`if univ_pos == NOUN:`
			`univ_pos = 'noun'`
			`elif univ_pos == VERB:`
			`univ_pos = 'verb'`
			`elif univ_pos == ADJ:`
			`univ_pos = 'adj'`
			`elif univ_pos == PUNCT:`
			`univ_pos = 'punct'`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`# See Issue #435 for example of where this logic is requied.`
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`if self.is_base_form(univ_pos, morphology):`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`return set([string.lower()])`
Fix lemmatizer 2016-09-27 18:47:05 +03:00			`lemmas = lemmatize(string, self.index.get(univ_pos, {}),`
			`self.exc.get(univ_pos, {}),`
			`self.rules.get(univ_pos, []))`
* Fix lemmatizer 2015-09-08 16:38:03 +03:00			`return lemmas`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`def is_base_form(self, univ_pos, morphology=None):`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`'''Check whether we're dealing with an uninflected paradigm, so we can`
			`avoid lemmatization entirely.'''`
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`morphology = {} if morphology is None else morphology`
Check POS key in lemmatizer, to update it for new data format 2016-12-18 15:28:20 +03:00			`others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]`
Update base-form check in lemmatizer, for UD 2.0 morphology 2017-03-17 01:59:31 +03:00			`true_morph_key = morphology.get('morph', 0)`
Fix lemmatizer 2016-09-27 18:47:05 +03:00			`if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`return True`
Fix lemmatizer 2016-09-27 18:47:05 +03:00			`elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`return True`
Update base-form check in lemmatizer, for UD 2.0 morphology 2017-03-17 01:59:31 +03:00			`elif true_morph_key in (VerbForm_inf, VerbForm_none):`
			`return True`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`else:`
			`return False`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`def noun(self, string, morphology=None):`
			`return self(string, 'noun', morphology)`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`def verb(self, string, morphology=None):`
			`return self(string, 'verb', morphology)`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`def adj(self, string, morphology=None):`
			`return self(string, 'adj', morphology)`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00
Change morphology and lemmatizer API Take morphology features as object instead of keyword arguments 2016-12-07 23:12:49 +03:00			`def punct(self, string, morphology=None):`
			`return self(string, 'punct', morphology)`
* Add support for punctuation lemmatization, to handle unicode characters. This should help in addressing Issue #130 2015-10-09 10:44:21 +03:00
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00
			`def lemmatize(string, index, exceptions, rules):`
			`string = string.lower()`
			`forms = []`
Pass lemmatizer morphological features, so that rules are sensitive to base/inflected distinction, which is how the WordNet data is designed. See Issue #435 2016-09-27 14:52:11 +03:00			`# TODO: Is this correct? See discussion in Issue #435.`
			`#if string in index:`
			`# forms.append(string)`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00			`forms.extend(exceptions.get(string, []))`
apply patch 2017-03-01 23:44:17 +03:00			`oov_forms = []`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00			`for old, new in rules:`
			`if string.endswith(old):`
			`form = string[:len(string) - len(old)] + new`
Fix #719: Lemmatizer can no longer output empty string 2017-03-18 18:02:06 +03:00			`if not form:`
			`pass`
			`elif form in index or not form.isalpha():`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00			`forms.append(form)`
apply patch 2017-03-01 23:44:17 +03:00			`else:`
			`oov_forms.append(form)`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00			`if not forms:`
apply patch 2017-03-01 23:44:17 +03:00			`forms.extend(oov_forms)`
Update base-form check in lemmatizer, for UD 2.0 morphology 2017-03-17 01:59:31 +03:00			`if not forms:`
			`forms.append(string)`
* Move lemmatizer to en dir 2014-12-23 07:16:57 +03:00			`return set(forms)`