mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
eddeb36c96
<!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results. At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
73 lines
2.3 KiB
Python
73 lines
2.3 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ....symbols import NOUN, VERB, ADJ, PUNCT
|
|
|
|
|
|
class GreekLemmatizer(object):
    """Rule-based lemmatizer adapted for Greek.

    It follows the default rule-based lemmatization procedure with two
    changes: a word that is already a lemma is returned unchanged, and the
    base-forms check is left out because it is not applicable to Greek.
    """

    @classmethod
    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
        """Alternate constructor that forwards the given tables to ``cls``.

        path: Accepted as part of the signature but not used here.
        index, exc, rules, lookup: Passed on positionally to the constructor.
        RETURNS: A new instance of ``cls``.
        """
        return cls(index, exc, rules, lookup)

    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
        """Store the lemmatization tables on the instance.

        index: Stored as ``self.index``; queried per part-of-speech key.
        exceptions: Stored as ``self.exc``; queried per part-of-speech key.
        rules: Stored as ``self.rules``; queried per part-of-speech key.
        lookup: Stored as ``self.lookup_table``; a fresh empty dict is used
            when it is None.
        """
        self.index = index
        self.exc = exceptions
        self.rules = rules
        if lookup is None:
            lookup = {}
        self.lookup_table = lookup

    def __call__(self, string, univ_pos, morphology=None):
        """Return the list of lemmas for ``string``.

        string: The word to lemmatize.
        univ_pos: Part-of-speech, given either as a symbol (NOUN, VERB, ADJ,
            PUNCT) or as its upper- or lower-case name.
        morphology: Accepted as part of the signature but not used here.
        RETURNS (list): The candidate lemmas.
        """
        # Without rules, the lookup table is the only source of lemmas and
        # unknown words are returned as they are.
        if not self.rules:
            return [self.lookup_table.get(string, string)]

        # Each table key paired with every spelling of the tag that selects it.
        pos_aliases = (
            ("noun", (NOUN, "NOUN", "noun")),
            ("verb", (VERB, "VERB", "verb")),
            ("adj", (ADJ, "ADJ", "adj")),
            ("punct", (PUNCT, "PUNCT", "punct")),
        )
        for pos_key, aliases in pos_aliases:
            if univ_pos in aliases:
                break
        else:
            # Any other part-of-speech: the lowercased word is its own lemma.
            return [string.lower()]

        return lemmatize(
            string,
            self.index.get(pos_key, {}),
            self.exc.get(pos_key, {}),
            self.rules.get(pos_key, []),
        )
|
|
|
|
|
|
def lemmatize(string, index, exceptions, rules):
    """Return the candidate lemmas for ``string`` without duplicates.

    The string is lowercased first. If the lowercased string is in ``index``
    it is already a lemma and is returned on its own. Otherwise the entries
    listed for it in ``exceptions`` are used. Only when there are none are the
    suffix ``rules`` applied: a rewritten form is accepted if it is in
    ``index`` or is not purely alphabetic, and is otherwise kept as an
    out-of-vocabulary fallback. If nothing at all is produced, the lowercased
    string itself is returned.

    string: The word to lemmatize.
    index: Container of known lemmas (only membership is tested).
    exceptions: Mapping from a lowercased word to a sequence of its lemmas.
    rules: Iterable of ``(old_suffix, new_suffix)`` pairs.
    RETURNS (list): Unique lemmas, in the order in which they were produced.
    """
    string = string.lower()
    if string in index:
        return [string]

    forms = list(exceptions.get(string, []))
    oov_forms = []
    if not forms:
        for old, new in rules:
            if not string.endswith(old):
                continue
            # Slice by explicit length so an empty ``old`` keeps the whole word.
            form = string[: len(string) - len(old)] + new
            if not form:
                continue
            if form in index or not form.isalpha():
                forms.append(form)
            else:
                oov_forms.append(form)
    if not forms:
        forms.extend(oov_forms)
    if not forms:
        forms.append(string)

    # De-duplicate while keeping first-seen order. Going through ``set`` made
    # the order of the result depend on string hashing, so it could change
    # from one run to the next.
    seen = set()
    unique_forms = []
    for form in forms:
        if form not in seen:
            seen.add(form)
            unique_forms.append(form)
    return unique_forms
|