Mirror of https://github.com/explosion/spaCy.git, synced 2025-11-01 00:17:44 +03:00
* Fix scoring normalization (#7629): score weights by total sum instead of per component, plus cleanup.
* Use a context manager when reading model (fix #7036) (#8244)
* Fix other open calls without context managers (#8245)
* Don't add duplicate patterns all the time in EntityRuler (fix #8216) (#8246):
  * Don't add duplicate patterns (fix #8216).
  * Refactor EntityRuler init: simplifies the EntityRuler init code, as prep for allowing the EntityRuler to reset itself.
  * Make EntityRuler.clear reset matchers, with a new test for this.
  * Tidy PhraseMatcher instantiation: since the attr can safely be None now, the guard `if` is no longer required. Also renamed the `_validate` attr (maybe it's not needed?).
  * Fix NER test; add a test to make sure patterns aren't increasing; move the test to the regression tests.
* Exclude generated .cpp files from package (#8271)
* Fix non-deterministic deduplication in Greek lemmatizer (#8421)
* Fix setting empty entities in Example.from_dict (#8426)
* Filter W036 for entity ruler, etc. (#8424)
* Preserve paths.vectors/initialize.vectors setting in quickstart template
* Various fixes for spans in Docs.from_docs (#8487): fix span offsets if a doc ends in a single space and no space is inserted; also include the spans key in the merged doc for empty spans lists.
* Fix duplicate spacy package CLI opts (#8551): use `-c` for `--code` and not additionally for `--create-meta`, in line with the docs.
* Raise an error for textcat with <2 labels (#8584): raise an error if initializing a `textcat` component without at least two labels; add a similar note to the docs and update the positive_label description in the API docs.
* Add Macedonian models to website (#8637)
* Fix Azerbaijani init, extend lang init tests (#8656)
* Fix ru/uk lemmatizer mp with spawn (#8657): use an instance variable instead of a class variable for the morphological analyzer so that multiprocessing with spawn is possible.
* Use 0-vector for OOV lexemes (#8639)
* Set version to v3.0.7

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
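One entry above, the non-deterministic deduplication fix in the Greek lemmatizer (#8421), is visible near the bottom of the file below. Deduplicating with a plain set() loses insertion order and can vary between interpreter runs under hash randomization, while dict.fromkeys keeps first-seen order. A minimal sketch of the difference (the example forms are illustrative):

    forms = ["λέξη", "λέξεις", "λέξη"]

    # Unordered: iteration order of a set of strings can change between
    # runs (PYTHONHASHSEED), so list(set(forms)) is non-deterministic.
    nondeterministic = list(set(forms))

    # Ordered: dict keys preserve insertion order (Python 3.7+), so the
    # first candidate lemma stays first, deterministically.
    deterministic = list(dict.fromkeys(forms))
    print(deterministic)  # ['λέξη', 'λέξεις']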
63 lines
2.1 KiB
Python
from typing import List

from ...pipeline import Lemmatizer
from ...tokens import Token

class GreekLemmatizer(Lemmatizer):
    """
    Greek language lemmatizer. Applies the default rule-based lemmatization
    procedure with some modifications for better Greek language support.

    The first modification is that it checks whether the word to lemmatize
    is already a lemma and, if so, simply returns it.
    The second modification is the removal of the base-forms function, which
    is not applicable to Greek.
    """
    def rule_lemmatize(self, token: Token) -> List[str]:
        """Lemmatize using a rule-based approach.

        token (Token): The token to lemmatize.
        RETURNS (list): The available lemmas for the string.
        """
        cache_key = (token.lower, token.pos)
        if cache_key in self.cache:
            return self.cache[cache_key]
        string = token.text
        univ_pos = token.pos_.lower()
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]

        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})
        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, {})
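        # Illustrative (assumed) shapes of these per-POS tables, for
        # readability; the real entries ship with spacy-lookups-data for "el":
        #   index:      {"λέξη", ...}                    known lemmas
        #   exceptions: {"surface": ["lemma", ...]}      irregular forms
        #   rules:      [["old_suffix", "new_suffix"], ...]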
        string = string.lower()
        forms = []
        # First modification: the word is already a known lemma, return it as is.
        if string in index:
            forms.append(string)
            return forms
        # Irregular forms take precedence over the suffix rules.
        forms.extend(exceptions.get(string, []))
        oov_forms = []
        if not forms:
            for old, new in rules:
                if string.endswith(old):
                    form = string[: len(string) - len(old)] + new
                    if not form:
                        pass
                    elif form in index or not form.isalpha():
                        forms.append(form)
                    else:
                        # Candidate not in the index: keep only as a fallback.
                        oov_forms.append(form)
        if not forms:
            forms.extend(oov_forms)
        if not forms:
            forms.append(string)
        # Deduplicate while preserving order so results are deterministic (#8421).
        forms = list(dict.fromkeys(forms))
        self.cache[cache_key] = forms
        return forms
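For reference, a minimal usage sketch. It assumes the trained Greek pipeline el_core_news_sm is installed (python -m spacy download el_core_news_sm); for the "el" language its lemmatizer component resolves to the GreekLemmatizer above, and the example sentence is purely illustrative:

    import spacy

    # Load a trained Greek pipeline; the tagger supplies the POS tags that
    # rule_lemmatize dispatches on (univ_pos above).
    nlp = spacy.load("el_core_news_sm")

    doc = nlp("Αυτές είναι οι λέξεις")
    for token in doc:
        # token.lemma_ comes from the first entry of the lemma list
        # computed by rule_lemmatize (behind the cache).
        print(token.text, token.pos_, token.lemma_)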