Merge remote-tracking branch 'origin/organize-language-data' into organize-language-data

Ines Montani 2016-12-18 16:57:12 +01:00
commit b11d8cd3db
4 changed files with 11 additions and 4 deletions

View File

@@ -21,3 +21,4 @@ class English(Language):
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
+   lemma_rules = LEMMA_RULES

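The new lemma_rules attribute lets the English defaults carry lemmatization rules as Python data instead of relying only on the lemma_rules.json file inside a model directory. The actual contents of LEMMA_RULES are not shown in this commit; the sketch below only illustrates the assumed shape, mirroring lemma_rules.json: lists of [suffix, replacement] rewrite rules keyed by coarse part-of-speech.

    # Illustrative only -- the real LEMMA_RULES data is not part of this diff.
    LEMMA_RULES = {
        "noun": [["s", ""], ["ses", "s"], ["ies", "y"]],
        "verb": [["ing", ""], ["ed", ""], ["es", ""]],
        "adj":  [["er", ""], ["est", ""]],
    }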
View File

@@ -39,7 +39,7 @@ class BaseDefaults(object):
        if nlp is None or nlp.path is None:
            return Lemmatizer({}, {}, {})
        else:
-           return Lemmatizer.load(nlp.path)
+           return Lemmatizer.load(nlp.path, rules=self.lemma_rules)

    @classmethod
    def create_vocab(cls, nlp=None):
@@ -160,6 +160,8 @@ class BaseDefaults(object):
    stop_words = set()
+   lemma_rules = {}
    lex_attr_getters = {
        attrs.LOWER: lambda string: string.lower(),
        attrs.NORM: lambda string: string,

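Taken together with the English change above, a language's defaults can now supply lemma_rules, and the method at the top of this hunk (presumably create_lemmatizer) forwards them to Lemmatizer.load() instead of depending solely on the JSON file shipped with a model. A minimal sketch of that flow, assuming spaCy's layout of this era; the subclass, its rules, and the import path are made up for illustration.

    from spacy.language import BaseDefaults   # assumed import path for this era
    from spacy.lemmatizer import Lemmatizer

    class MyDefaults(BaseDefaults):
        # Hypothetical per-language rules; only the lemma_rules hook itself
        # comes from the diff.
        lemma_rules = {"noun": [["s", ""]]}

    # Without a loaded model path the defaults still return an empty Lemmatizer;
    # with one, the class-level rules are now passed through, roughly:
    #     Lemmatizer.load(nlp.path, rules=MyDefaults.lemma_rules)
    lemmatizer = MyDefaults.create_lemmatizer(nlp=None)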
View File

@@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT
class Lemmatizer(object):
    @classmethod
-   def load(cls, path):
+   def load(cls, path, rules=None):
        index = {}
        exc = {}
        for pos in ['adj', 'noun', 'verb']:
@@ -25,8 +25,11 @@ class Lemmatizer(object):
                    exc[pos] = read_exc(file_)
            else:
                exc[pos] = {}
+       if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
+           with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
+               rules = json.load(file_)
+       elif rules is None:
+           rules = {}
        return cls(index, exc, rules)

    def __init__(self, index, exceptions, rules):

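The net effect of the new rules argument to Lemmatizer.load(): rules supplied by the caller take precedence, vocab/lemma_rules.json is only read when rules is None, and a missing file now falls back to an empty dict instead of failing. A small usage sketch under those assumptions; the index and rule data are made up, while the constructor and load() signatures are the ones shown in the diff.

    from spacy.lemmatizer import Lemmatizer

    index = {"noun": {"duck"}}          # made-up lemma index
    rules = {"noun": [["s", ""]]}       # made-up suffix-rewrite rule

    # Direct construction with in-memory data -- no model directory required.
    lemmatizer = Lemmatizer(index, {}, rules)
    print(lemmatizer("ducks", "noun"))  # expected: a set of candidates, e.g. {'duck'}

    # Loading index/exceptions from a model directory while overriding the
    # on-disk lemma_rules.json (model_path is a pathlib.Path to a model):
    #     lemmatizer = Lemmatizer.load(model_path, rules=rules)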
View File

@@ -140,6 +140,7 @@ cdef class Morphology:
        lemma = self.strings[lemma_string]
        return lemma

IDS = {
    "Animacy_anim": Animacy_anim,
    "Animacy_inam": Animacy_inam,