mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-30 20:06:30 +03:00
7d8df69158
* Improve load_language_data helper * WIP: Add Lookups implementation * Start moving lemma data over to JSON * WIP: move data over for more languages * Convert more languages * Fix lemmatizer fixtures in tests * Finish conversion * Auto-format JSON files * Fix test for now * Make sure tables are stored on instance * Update docstrings * Update docstrings and errors * Update test * Add Lookups.__len__ * Add serialization methods * Add Lookups.remove_table * Use msgpack for serialization to disk * Fix file exists check * Try using OrderedDict for everything * Update .flake8 [ci skip] * Try fixing serialization * Update test_lookups.py * Update test_serialize_vocab_strings.py * Lookups / Tables now work This implements the stubs in the Lookups/Table classes. Currently this is in Cython but with no type declarations, so that could be improved. * Add lookups to setup.py * Actually add lookups pyx The previous commit added the old py file... * Lookups work-in-progress * Move from pyx back to py * Add string based lookups, fix serialization * Update tests, language/lemmatizer to work with string lookups There are some outstanding issues here: - a pickling-related test fails due to the bloom filter - some custom lemmatizers (fr/nl at least) have issues More generally, there's a question of how to deal with the case where you have a string but want to use the lookup table. Currently the table allows access by string or id, but that's getting pretty awkward. * Change lemmatizer lookup method to pass (orth, string) * Fix token lookup * Fix French lookup * Fix lt lemmatizer test * Fix Dutch lemmatizer * Fix lemmatizer lookup test This was using a normal dict instead of a Table, so checks for the string instead of an integer key failed. * Make uk/nl/ru lemmatizer lookup methods consistent The mentioned tokenizers all have their own implementation of the `lookup` method, which accesses a `Lookups` table. 
The way that was called in `token.pyx` was changed so this should be updated to have the same arguments as `lookup` in `lemmatizer.py` (specifically (orth/id, string)). Prior to this change tests weren't failing, but there would probably be issues with normal use of a model. More tests should probably be added. Additionally, the language-specific `lookup` implementations seem like they might not be needed, since they handle things like lower-casing that aren't actually language specific. * Make recently added Greek method compatible * Remove redundant class/method Leftovers from a merge not cleaned up adequately.
78 lines
2.4 KiB
Python
78 lines
2.4 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from ....symbols import NOUN, VERB, ADJ, PUNCT
|
|
|
|
|
|
class GreekLemmatizer(object):
    """
    Greek language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Greek language support.

    The first modification is that it checks if the word for lemmatization is
    already a lemma and if yes, it just returns it.
    The second modification is about removing the base forms function which is
    not applicable for Greek language.
    """

    @classmethod
    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
        """Build a lemmatizer from already-loaded data.

        `path` is accepted but not used; the remaining arguments are passed
        straight through to the constructor.
        """
        return cls(index, exc, rules, lookup)

    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
        """Store the lemmatization data.

        index: mapping of part-of-speech name -> collection of known lemmas.
        exceptions: mapping of part-of-speech name -> {word: [lemmas]}.
        rules: mapping of part-of-speech name -> list of (old, new) suffix
            pairs.
        lookup: word -> lemma table; an empty dict is used when omitted.
        """
        self.index = index
        self.exc = exceptions
        self.rules = rules
        self.lookup_table = lookup if lookup is not None else {}

    def __call__(self, string, univ_pos, morphology=None):
        """Return a list of candidate lemmas for `string`.

        string: the word to lemmatize.
        univ_pos: part of speech, given either as a symbol constant or as
            its upper-/lower-case name.
        morphology: accepted but not used.

        Without rules the lookup table alone decides the result (the word
        itself is returned when it has no entry). With rules, only nouns,
        verbs, adjectives and punctuation go through rule-based
        lemmatization; any other part of speech yields the lower-cased word.
        """
        if not self.rules:
            return [self.lookup_table.get(string, string)]
        if univ_pos in (NOUN, "NOUN", "noun"):
            univ_pos = "noun"
        elif univ_pos in (VERB, "VERB", "verb"):
            univ_pos = "verb"
        elif univ_pos in (ADJ, "ADJ", "adj"):
            univ_pos = "adj"
        elif univ_pos in (PUNCT, "PUNCT", "punct"):
            univ_pos = "punct"
        else:
            return [string.lower()]
        # `index` and `exceptions` default to None in the constructor, so
        # fall back to empty tables instead of failing on `None.get`.
        index = self.index if self.index is not None else {}
        exceptions = self.exc if self.exc is not None else {}
        return lemmatize(
            string,
            index.get(univ_pos, {}),
            exceptions.get(univ_pos, {}),
            self.rules.get(univ_pos, []),
        )

    def lookup(self, orth, string):
        """Return the lookup-table entry for `orth`, or `string` if absent."""
        if orth in self.lookup_table:
            return self.lookup_table[orth]
        return string
|
|
|
|
|
|
def lemmatize(string, index, exceptions, rules):
    """Return the candidate lemmas for `string`, without duplicates.

    string: the word to lemmatize; it is lower-cased before any matching.
    index: collection of known lemmas (only membership tests are used).
    exceptions: mapping of word -> list of lemmas (only `.get` is used).
    rules: iterable of (old_suffix, new_suffix) pairs.

    Resolution order:
      1. A word found in `index` is already a lemma and is returned as is.
      2. Otherwise the lemmas listed for it in `exceptions` are used.
      3. Otherwise every rule whose old suffix matches is applied. Results
         that are in `index` or are not purely alphabetic are preferred;
         the remaining results are used only when there are none of those.
      4. If nothing was produced, the lower-cased word itself is returned.

    The result keeps the order in which the candidates were produced, so
    the output is the same on every run.
    """
    string = string.lower()
    if string in index:
        return [string]
    forms = list(exceptions.get(string, []))
    if not forms:
        oov_forms = []
        for old, new in rules:
            if not string.endswith(old):
                continue
            form = string[: len(string) - len(old)] + new
            if not form:
                # The rule consumed the whole word; skip the empty result.
                continue
            if form in index or not form.isalpha():
                forms.append(form)
            else:
                oov_forms.append(form)
        if not forms:
            forms = oov_forms
    if not forms:
        forms.append(string)
    # De-duplicate while keeping first-seen order: building the list from a
    # set would order the candidates by string hash, which is not stable
    # between interpreter runs.
    seen = set()
    unique_forms = []
    for form in forms:
        if form not in seen:
            seen.add(form)
            unique_forms.append(form)
    return unique_forms
|