spaCy/spacy/lang/el/lemmatizer.py
Ines Montani db55577c45
Drop Python 2.7 and 3.5 (#4828)
* Remove unicode declarations

* Remove Python 3.5 and 2.7 from CI

* Don't require pathlib

* Replace compat helpers

* Remove OrderedDict

* Use f-strings

* Set Cython compiler language level

* Fix typo

* Re-add OrderedDict for Table

* Update setup.cfg

* Revert CONTRIBUTING.md

* Revert lookups.md

* Revert top-level.md

* Small adjustments and docs [ci skip]
2019-12-22 01:53:56 +01:00

38 lines
1.3 KiB
Python

from ...lemmatizer import Lemmatizer
class GreekLemmatizer(Lemmatizer):
"""
Greek language lemmatizer applies the default rule based lemmatization
procedure with some modifications for better Greek language support.
The first modification is that it checks if the word for lemmatization is
already a lemma and if yes, it just returns it.
The second modification is about removing the base forms function which is
not applicable for Greek language.
"""
def lemmatize(self, string, index, exceptions, rules):
string = string.lower()
forms = []
if string in index:
forms.append(string)
return forms
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
for old, new in rules:
if string.endswith(old):
form = string[: len(string) - len(old)] + new
if not form:
pass
elif form in index or not form.isalpha():
forms.append(form)
else:
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return list(set(forms))