Mirror of https://github.com/explosion/spaCy.git
Add spacy.blank() method, that doesn't load data. Don't try to load data if path is falsey
This commit is contained in:
parent ae202e7a60
commit 722199acb8
@@ -19,6 +19,23 @@ set_lang_class(de.German.lang, de.German)
 set_lang_class(zh.Chinese.lang, zh.Chinese)
+
+
+def blank(name, vocab=None, tokenizer=None, parser=None, tagger=None, entity=None,
+          matcher=None, serializer=None, vectors=None, pipeline=None):
+    target_name, target_version = util.split_data_name(name)
+    cls = get_lang_class(target_name)
+    return cls(
+        path,
+        vectors=vectors,
+        vocab=vocab,
+        tokenizer=tokenizer,
+        tagger=tagger,
+        parser=parser,
+        entity=entity,
+        matcher=matcher,
+        pipeline=pipeline,
+        serializer=serializer)
+
+
 def load(name, vocab=True, tokenizer=True, parser=True, tagger=True, entity=True,
          matcher=True, serializer=True, vectors=True, pipeline=True, via=None):
     if via is None:
@@ -36,7 +36,7 @@ class BaseDefaults(object):
         self.path = path
         self.lang = lang
         self.lex_attr_getters = dict(self.__class__.lex_attr_getters)
-        if (self.path / 'vocab' / 'oov_prob').exists():
+        if self.path and (self.path / 'vocab' / 'oov_prob').exists():
             with (self.path / 'vocab' / 'oov_prob').open() as file_:
                 oov_prob = file_.read().strip()
                 self.lex_attr_getters[PROB] = lambda string: oov_prob
@@ -44,7 +44,7 @@ class BaseDefaults(object):
         self.lex_attr_getters[IS_STOP] = lambda string: string in self.stop_words
 
     def Lemmatizer(self):
-        return Lemmatizer.load(self.path)
+        return Lemmatizer.load(self.path) if self.path else Lemmatizer({}, {}, {})
 
     def Vectors(self):
         return True
@@ -59,9 +59,13 @@ class BaseDefaults(object):
         lemmatizer = self.Lemmatizer()
         if vectors is True:
             vectors = self.Vectors()
-        return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
-                          tag_map=tag_map, lemmatizer=lemmatizer,
-                          serializer_freqs=serializer_freqs)
+        if self.path:
+            return Vocab.load(self.path, lex_attr_getters=lex_attr_getters,
+                              tag_map=tag_map, lemmatizer=lemmatizer,
+                              serializer_freqs=serializer_freqs)
+        else:
+            return Vocab(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
+                         lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
     def Tokenizer(self, vocab, rules=None, prefix_search=None, suffix_search=None,
                   infix_finditer=None):
@@ -73,27 +77,41 @@ class BaseDefaults(object):
             suffix_search = util.compile_suffix_regex(self.suffixes).search
         if infix_finditer is None:
            infix_finditer = util.compile_infix_regex(self.infixes).finditer
-        return Tokenizer(vocab, rules=rules,
-                         prefix_search=prefix_search, suffix_search=suffix_search,
-                         infix_finditer=infix_finditer)
+        if self.path:
+            return Tokenizer.load(self.path, vocab, rules=rules,
+                                  prefix_search=prefix_search,
+                                  suffix_search=suffix_search,
+                                  infix_finditer=infix_finditer)
+        else:
+            return Tokenizer(vocab, rules=rules,
+                             prefix_search=prefix_search, suffix_search=suffix_search,
+                             infix_finditer=infix_finditer)
 
     def Tagger(self, vocab):
-        return Tagger.load(self.path / 'pos', vocab)
+        if self.path:
+            return Tagger.load(self.path / 'pos', vocab)
+        else:
+            return Tagger.blank(vocab, Tagger.default_templates(self.lang))
 
     def Parser(self, vocab):
-        if (self.path / 'deps').exists():
+        if self.path:
             return Parser.load(self.path / 'deps', vocab, ArcEager)
         else:
-            return None
+            return Parser.blank(vocab, ArcEager,
+                                Parser.default_templates('%s-parser' % self.lang))
 
     def Entity(self, vocab):
-        if (self.path / 'ner').exists():
+        if self.path and (self.path / 'ner').exists():
             return Parser.load(self.path / 'ner', vocab, BiluoPushDown)
         else:
-            return None
+            return Parser.blank(vocab, BiluoPushDown,
+                                Parser.default_templates('%s-entity' % self.lang))
 
     def Matcher(self, vocab):
-        return Matcher.load(self.path, vocab)
+        if self.path:
+            return Matcher.load(self.path, vocab)
+        else:
+            return Matcher(vocab)
 
     def Pipeline(self, nlp):
         return [
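As a rough usage sketch (not part of the commit): with this change, a pipeline can be constructed without reading any model data from disk, while the existing load() entry point keeps its behaviour. The snippet below assumes the new blank() function is exposed as spacy.blank() and that English model data is installed for the load() call; exact signatures and behaviour may differ from this in-progress commit.

    import spacy

    # Existing entry point: loads vocab, tagger, parser, entity recognizer,
    # etc. from the model data on disk.
    nlp = spacy.load('en')

    # New entry point: no data is loaded. BaseDefaults falls back to empty
    # components (Vocab(), Tokenizer(), Tagger.blank(), Parser.blank(),
    # Matcher()) because self.path is falsey.
    nlp_blank = spacy.blank('en')

    doc = nlp_blank(u'Tokenization works, but no statistical models are loaded.')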