Build the lemmatizer from the lemma data defined in code, rather than loading it from the downloaded model.

This commit is contained in:
Matthew Honnibal 2017-03-15 04:52:50 -05:00
parent 42ba740dde
commit f70be44746
3 changed files with 9 additions and 5 deletions

View File

@ -31,6 +31,10 @@ class English(Language):
tag_map = TAG_MAP tag_map = TAG_MAP
stop_words = STOP_WORDS stop_words = STOP_WORDS
lemma_rules = dict(LEMMA_RULES)
lemma_index = dict(LEMMA_INDEX)
lemma_exc = dict(LEMMA_EXC)
def __init__(self, **overrides): def __init__(self, **overrides):
# Make a special-case hack for loading the GloVe vectors, to support # Make a special-case hack for loading the GloVe vectors, to support

View File

@ -21,7 +21,7 @@ EXC = {
"adj": ADJECTIVES_IRREG, "adj": ADJECTIVES_IRREG,
"adv": ADVERBS_IRREG, "adv": ADVERBS_IRREG,
"noun": NOUNS_IRREG, "noun": NOUNS_IRREG,
"verbs": VERBS_IRREG "verb": VERBS_IRREG
} }

View File

@ -33,6 +33,7 @@ from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
from .syntax.parser import get_templates from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity from .syntax.nonproj import PseudoProjectivity
from .pipeline import DependencyParser, EntityRecognizer from .pipeline import DependencyParser, EntityRecognizer
from .pipeline import BeamDependencyParser, BeamEntityRecognizer
from .syntax.arc_eager import ArcEager from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown from .syntax.ner import BiluoPushDown
@ -40,10 +41,7 @@ from .syntax.ner import BiluoPushDown
class BaseDefaults(object): class BaseDefaults(object):
@classmethod @classmethod
def create_lemmatizer(cls, nlp=None): def create_lemmatizer(cls, nlp=None):
if nlp is None or nlp.path is None: return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)
return Lemmatizer({}, {}, {})
else:
return Lemmatizer.load(nlp.path, rules=cls.lemma_rules)
@classmethod @classmethod
def create_vocab(cls, nlp=None): def create_vocab(cls, nlp=None):
@ -169,6 +167,8 @@ class BaseDefaults(object):
stop_words = set() stop_words = set()
lemma_rules = {} lemma_rules = {}
lemma_exc = {}
lemma_index = {}
lex_attr_getters = { lex_attr_getters = {
attrs.LOWER: lambda string: string.lower(), attrs.LOWER: lambda string: string.lower(),