diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 2ac839120..b19e49a36 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -21,3 +21,4 @@ class English(Language): tokenizer_exceptions = TOKENIZER_EXCEPTIONS tag_map = TAG_MAP stop_words = STOP_WORDS + lemma_rules = LEMMA_RULES diff --git a/spacy/language.py b/spacy/language.py index 222aadf16..2019c7d88 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -39,7 +39,7 @@ class BaseDefaults(object): if nlp is None or nlp.path is None: return Lemmatizer({}, {}, {}) else: - return Lemmatizer.load(nlp.path) + return Lemmatizer.load(nlp.path, rules=self.lemma_rules) @classmethod def create_vocab(cls, nlp=None): @@ -160,6 +160,8 @@ class BaseDefaults(object): stop_words = set() + lemma_rules = {} + lex_attr_getters = { attrs.LOWER: lambda string: string.lower(), attrs.NORM: lambda string: string, diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a79ecb009..960467a0b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @classmethod - def load(cls, path): + def load(cls, path, rules=None): index = {} exc = {} for pos in ['adj', 'noun', 'verb']: @@ -25,8 +25,11 @@ class Lemmatizer(object): exc[pos] = read_exc(file_) else: exc[pos] = {} - with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: - rules = json.load(file_) + if rules is None and (path / 'vocab' / 'lemma_rules.json').exists(): + with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: + rules = json.load(file_) + elif rules is None: + rules = {} return cls(index, exc, rules) def __init__(self, index, exceptions, rules): diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fbcbc2e66..fb6273753 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -140,6 +140,7 @@ cdef class Morphology: lemma = self.strings[lemma_string] return lemma + IDS = { "Animacy_anim": Animacy_anim, "Animacy_inam": Animacy_inam,