mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 09:42:26 +03:00
* Update init_model, making language resources optional
This commit is contained in:
parent
424854028f
commit
386246db5b
|
@ -44,6 +44,9 @@ def setup_tokenizer(lang_data_dir, tok_dir):
|
||||||
|
|
||||||
|
|
||||||
def _read_clusters(loc):
|
def _read_clusters(loc):
|
||||||
|
if not loc.exists():
|
||||||
|
print "Warning: Clusters file not found"
|
||||||
|
return {}
|
||||||
clusters = {}
|
clusters = {}
|
||||||
for line in codecs.open(str(loc), 'r', 'utf8'):
|
for line in codecs.open(str(loc), 'r', 'utf8'):
|
||||||
try:
|
try:
|
||||||
|
@ -68,6 +71,9 @@ def _read_clusters(loc):
|
||||||
|
|
||||||
|
|
||||||
def _read_probs(loc):
|
def _read_probs(loc):
|
||||||
|
if not loc.exists():
|
||||||
|
print "Warning: Probabilities file not found"
|
||||||
|
return {}
|
||||||
probs = {}
|
probs = {}
|
||||||
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
|
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
|
||||||
prob, word = line.split()
|
prob, word = line.split()
|
||||||
|
@ -78,6 +84,9 @@ def _read_probs(loc):
|
||||||
|
|
||||||
def _read_senses(loc):
|
def _read_senses(loc):
|
||||||
lexicon = defaultdict(lambda: defaultdict(list))
|
lexicon = defaultdict(lambda: defaultdict(list))
|
||||||
|
if not loc.exists():
|
||||||
|
print "Warning: WordNet senses not found"
|
||||||
|
return lexicon
|
||||||
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
|
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
|
||||||
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
|
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
|
||||||
for line in codecs.open(str(loc), 'r', 'utf8'):
|
for line in codecs.open(str(loc), 'r', 'utf8'):
|
||||||
|
@ -99,6 +108,8 @@ def setup_vocab(src_dir, dst_dir):
|
||||||
vectors_src = src_dir / 'vectors.tgz'
|
vectors_src = src_dir / 'vectors.tgz'
|
||||||
if vectors_src.exists():
|
if vectors_src.exists():
|
||||||
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
|
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
|
||||||
|
else:
|
||||||
|
print "Warning: Word vectors file not found"
|
||||||
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
|
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
|
||||||
clusters = _read_clusters(src_dir / 'clusters.txt')
|
clusters = _read_clusters(src_dir / 'clusters.txt')
|
||||||
probs = _read_probs(src_dir / 'words.sgt.prob')
|
probs = _read_probs(src_dir / 'words.sgt.prob')
|
||||||
|
|
Loading…
Reference in New Issue
Block a user