Wire up lemmatizer rules for English

Matthew Honnibal 2016-12-18 15:50:09 +01:00
parent e6fc4afb04
commit 44f4f008bd
3 changed files with 12 additions and 4 deletions


@@ -19,6 +19,8 @@ from ..language_data import EMOTICONS
 from .language_data import ORTH_ONLY
 from .language_data import get_time_exc
+from .lemma_rules import LEMMA_RULES
+
 
 
 TOKENIZER_EXCEPTIONS = dict(language_data.TOKENIZER_EXCEPTIONS)
 TOKENIZER_PREFIXES = tuple(language_data.TOKENIZER_PREFIXES)
@@ -47,3 +49,4 @@ class English(Language):
     infixes = TOKENIZER_INFIXES
     tag_map = TAG_MAP
     stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
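For reference, a lemma rules table of this kind is a plain dict of suffix-rewrite rules keyed by part of speech: each rule is an [old suffix, replacement] pair tried against a word's ending. A minimal sketch of the shape of the LEMMA_RULES object imported above; the particular entries below are made-up examples, not spaCy's actual English data:

    # Illustrative only: the real table lives in the .lemma_rules module
    # imported above. These entries are examples of the shape, not the data.
    LEMMA_RULES = {
        'noun': [
            ['ies', 'y'],   # parties -> party
            ['es', 'e'],    # horses -> horse
            ['s', ''],      # dogs -> dog
        ],
        'verb': [
            ['ing', ''],    # walking -> walk
            ['ed', ''],     # talked -> talk
            ['s', ''],      # runs -> run
        ],
        'adj': [
            ['er', ''],     # taller -> tall
            ['est', ''],    # tallest -> tall
        ],
    }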


@@ -38,7 +38,7 @@ class BaseDefaults(object):
         if nlp is None or nlp.path is None:
             return Lemmatizer({}, {}, {})
         else:
-            return Lemmatizer.load(nlp.path)
+            return Lemmatizer.load(nlp.path, rules=self.lemma_rules)
 
     @classmethod
     def create_vocab(cls, nlp=None):
@@ -159,6 +159,8 @@ class BaseDefaults(object):
     stop_words = set()
+
+    lemma_rules = {}
 
     lex_attr_getters = {
         attrs.LOWER: lambda string: string.lower(),
         attrs.NORM: lambda string: string,
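Taken together, the two hunks above let each language ship its own rules without touching the loading code: lemma_rules defaults to an empty dict on BaseDefaults, a language subclass overrides the attribute, and create_lemmatizer forwards whatever the class carries. A runnable sketch of that pattern, using a stand-in Lemmatizer and an invented rule rather than spaCy's own classes:

    # Stand-in classes to illustrate the wiring; not spaCy's actual API.
    class Lemmatizer(object):
        def __init__(self, index, exceptions, rules):
            self.rules = rules

    class BaseDefaults(object):
        lemma_rules = {}  # default added by this commit: no rules

        @classmethod
        def create_lemmatizer(cls, nlp=None):
            # The rules travel with the class, so a language customises
            # lemmatization by overriding the attribute, not this method.
            return Lemmatizer({}, {}, cls.lemma_rules)

    class EnglishDefaults(BaseDefaults):
        lemma_rules = {'noun': [['s', '']]}  # example rule only

    assert EnglishDefaults.create_lemmatizer().rules == {'noun': [['s', '']]}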


@@ -9,7 +9,7 @@ from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 
 class Lemmatizer(object):
     @classmethod
-    def load(cls, path):
+    def load(cls, path, rules=None):
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
@@ -25,8 +25,11 @@ class Lemmatizer(object):
                 exc[pos] = read_exc(file_)
             else:
                 exc[pos] = {}
-        with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
-            rules = json.load(file_)
+        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
+            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
+                rules = json.load(file_)
+        elif rules is None:
+            rules = {}
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
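The effect of the new signature is a three-way precedence: rules passed by the caller win; otherwise lemma_rules.json is read if it exists; otherwise the rules default to empty, which also makes the JSON file optional where the old code required it. A standalone sketch of just that resolution order (resolve_rules is a hypothetical helper for illustration, not part of spaCy):

    import json
    from pathlib import Path

    # Hypothetical helper mirroring the fallback order in load() above.
    def resolve_rules(path, rules=None):
        rules_loc = Path(path) / 'vocab' / 'lemma_rules.json'
        if rules is None and rules_loc.exists():
            # No explicit rules: fall back to the on-disk table.
            with rules_loc.open('r', encoding='utf8') as file_:
                rules = json.load(file_)
        elif rules is None:
            # Nothing supplied and nothing on disk: empty rule set.
            rules = {}
        return rules

Since BaseDefaults.create_lemmatizer now always passes rules explicitly (an empty dict by default, which is not None), the file-reading branch only fires for direct Lemmatizer.load(path) calls.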