mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 18:36:36 +03:00
cf65a80f36
* Move test * Allow default in Lookups.get_table * Start with blank tables in Lookups.from_bytes * Refactor lemmatizer to hold instance of Lookups * Get lookups table within the lemmatization methods to make sure it references the correct table (even if the table was replaced or modified, e.g. when loading a model from disk) * Deprecate other arguments on Lemmatizer.__init__ and expect Lookups for consistency * Remove old and unsupported Lemmatizer.load classmethod * Refactor language-specific lemmatizers to inherit as much as possible from base class and override only what they need * Update tests and docs * Fix more tests * Fix lemmatizer * Upgrade pytest to try and fix weird CI errors * Try pytest 4.6.5
41 lines
1.3 KiB
Python
41 lines
1.3 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...lemmatizer import Lemmatizer
|
|
|
|
|
|
class GreekLemmatizer(Lemmatizer):
    """Rule-based lemmatizer with adjustments for the Greek language.

    Greek language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Greek language support.

    The first modification is that it checks if the word for lemmatization is
    already a lemma and if yes, it just returns it.
    The second modification is about removing the base forms function which is
    not applicable for Greek language.
    """

    def lemmatize(self, string, index, exceptions, rules):
        """Return the candidate lemmas for a word.

        string (unicode): The word to lemmatize; it is lowercased first.
        index: Container of known lemmas, only used for membership tests.
        exceptions: Mapping from a word to an iterable of its lemmas,
            accessed via ``exceptions.get(word, [])``.
        rules: Iterable of ``(old, new)`` pairs; when the word ends with
            ``old``, that suffix is replaced by ``new``.
        RETURNS (list): Unique candidate lemmas in the order they were found.
            Falls back to the lowercased word itself if nothing else matched.
        """
        string = string.lower()
        # A word that is already a known lemma is returned unchanged.
        if string in index:
            return [string]
        forms = list(exceptions.get(string, []))
        oov_forms = []
        # The suffix rules are only consulted when no exception matched.
        if not forms:
            for old, new in rules:
                if not string.endswith(old):
                    continue
                form = string[: len(string) - len(old)] + new
                if not form:
                    # The rule consumed the whole word; nothing to keep.
                    continue
                if form in index or not form.isalpha():
                    forms.append(form)
                else:
                    # Not a known lemma: keep only as a fallback candidate.
                    oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms:
            forms.append(string)
        # Deduplicate while keeping first-seen order. Going through a plain
        # set() would make the order of the result depend on string hashing,
        # which can differ between interpreter runs.
        seen = set()
        unique_forms = []
        for form in forms:
            if form not in seen:
                seen.add(form)
                unique_forms.append(form)
        return unique_forms
|