From e6fc4afb04a0e6f442793b6b420d4a0e381cd758 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 15:48:00 +0100 Subject: [PATCH 1/2] Whitespace --- spacy/morphology.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index fbcbc2e66..fb6273753 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -140,6 +140,7 @@ cdef class Morphology: lemma = self.strings[lemma_string] return lemma + IDS = { "Animacy_anim": Animacy_anim, "Animacy_inam": Animacy_inam, From 44f4f008bd7df82647b215466c6b268847f53f44 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 18 Dec 2016 15:50:09 +0100 Subject: [PATCH 2/2] Wire up lemmatizer rules for English --- spacy/en/__init__.py | 3 +++ spacy/language.py | 4 +++- spacy/lemmatizer.py | 9 ++++++--- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 623585f7d..6e706db52 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -19,6 +19,8 @@ from ..language_data import EMOTICONS from .language_data import ORTH_ONLY from .language_data import get_time_exc +from .lemma_rules import LEMMA_RULES + TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS) TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES) @@ -47,3 +49,4 @@ class English(Language): infixes = TOKENIZER_INFIXES tag_map = TAG_MAP stop_words = STOP_WORDS + lemma_rules = LEMMA_RULES diff --git a/spacy/language.py b/spacy/language.py index 78dbac953..7d3d91846 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -38,7 +38,7 @@ class BaseDefaults(object): if nlp is None or nlp.path is None: return Lemmatizer({}, {}, {}) else: - return Lemmatizer.load(nlp.path) + return Lemmatizer.load(nlp.path, rules=self.lemma_rules) @classmethod def create_vocab(cls, nlp=None): @@ -159,6 +159,8 @@ class BaseDefaults(object): stop_words = set() + lemma_rules = {} + lex_attr_getters = { attrs.LOWER: lambda string: string.lower(), attrs.NORM: lambda string: string, diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index a79ecb009..960467a0b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @classmethod - def load(cls, path): + def load(cls, path, rules=None): index = {} exc = {} for pos in ['adj', 'noun', 'verb']: @@ -25,8 +25,11 @@ class Lemmatizer(object): exc[pos] = read_exc(file_) else: exc[pos] = {} - with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: - rules = json.load(file_) + if rules is None and (path / 'vocab' / 'lemma_rules.json').exists(): + with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: + rules = json.load(file_) + elif rules is None: + rules = {} return cls(index, exc, rules) def __init__(self, index, exceptions, rules):