* Fix structure of wordnet directory for init_model

This commit is contained in:
Matthew Honnibal 2015-07-23 06:35:38 +02:00
parent 16617142b7
commit 4af2595d99

View File

@@ -15,6 +15,8 @@ Requires:
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.tgz --- output of something like word2vec
"""
from __future__ import unicode_literals
import plac
from pathlib import Path
@@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
def _read_clusters(loc):
if not loc.exists():
print "Warning: Clusters file not found"
print("Warning: Clusters file not found")
return {}
clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'):
@@ -72,7 +74,7 @@ def _read_clusters(loc):
def _read_probs(loc):
if not loc.exists():
print "Warning: Probabilities file not found"
print("Warning: Probabilities file not found")
return {}
probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
@@ -85,7 +87,7 @@ def _read_probs(loc):
def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists():
print "Warning: WordNet senses not found"
print("Warning: WordNet senses not found")
return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
@@ -109,7 +111,7 @@ def setup_vocab(src_dir, dst_dir):
if vectors_src.exists():
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print "Warning: Word vectors file not found"
print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs = _read_probs(src_dir / 'words.sgt.prob')
@@ -143,7 +145,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab')
if not (model_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
if __name__ == '__main__':