mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
* Fix structure of wordnet directory for init_model
This commit is contained in:
parent
16617142b7
commit
4af2595d99
|
@ -15,6 +15,8 @@ Requires:
|
||||||
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
|
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
|
||||||
* vectors.tgz --- output of something like word2vec
|
* vectors.tgz --- output of something like word2vec
|
||||||
"""
|
"""
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
|
||||||
|
|
||||||
def _read_clusters(loc):
|
def _read_clusters(loc):
|
||||||
if not loc.exists():
|
if not loc.exists():
|
||||||
print "Warning: Clusters file not found"
|
print("Warning: Clusters file not found")
|
||||||
return {}
|
return {}
|
||||||
clusters = {}
|
clusters = {}
|
||||||
for line in codecs.open(str(loc), 'r', 'utf8'):
|
for line in codecs.open(str(loc), 'r', 'utf8'):
|
||||||
|
@ -72,7 +74,7 @@ def _read_clusters(loc):
|
||||||
|
|
||||||
def _read_probs(loc):
|
def _read_probs(loc):
|
||||||
if not loc.exists():
|
if not loc.exists():
|
||||||
print "Warning: Probabilities file not found"
|
print("Warning: Probabilities file not found")
|
||||||
return {}
|
return {}
|
||||||
probs = {}
|
probs = {}
|
||||||
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
|
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
|
||||||
|
@ -85,7 +87,7 @@ def _read_probs(loc):
|
||||||
def _read_senses(loc):
|
def _read_senses(loc):
|
||||||
lexicon = defaultdict(lambda: defaultdict(list))
|
lexicon = defaultdict(lambda: defaultdict(list))
|
||||||
if not loc.exists():
|
if not loc.exists():
|
||||||
print "Warning: WordNet senses not found"
|
print("Warning: WordNet senses not found")
|
||||||
return lexicon
|
return lexicon
|
||||||
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
|
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
|
||||||
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
|
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
|
||||||
|
@ -109,7 +111,7 @@ def setup_vocab(src_dir, dst_dir):
|
||||||
if vectors_src.exists():
|
if vectors_src.exists():
|
||||||
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
|
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
|
||||||
else:
|
else:
|
||||||
print "Warning: Word vectors file not found"
|
print("Warning: Word vectors file not found")
|
||||||
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
|
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
|
||||||
clusters = _read_clusters(src_dir / 'clusters.txt')
|
clusters = _read_clusters(src_dir / 'clusters.txt')
|
||||||
probs = _read_probs(src_dir / 'words.sgt.prob')
|
probs = _read_probs(src_dir / 'words.sgt.prob')
|
||||||
|
@ -143,7 +145,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
|
||||||
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
|
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
|
||||||
setup_vocab(corpora_dir, model_dir / 'vocab')
|
setup_vocab(corpora_dir, model_dir / 'vocab')
|
||||||
if not (model_dir / 'wordnet').exists():
|
if not (model_dir / 'wordnet').exists():
|
||||||
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
|
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in New Issue
Block a user