From 4af2595d99895c9b7c059368dfd1c51588517e87 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 23 Jul 2015 06:35:38 +0200
Subject: [PATCH] * Fix structure of wordnet directory for init_model

---
 bin/init_model.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/bin/init_model.py b/bin/init_model.py
index c97f1f5dd..d4e40d840 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -15,6 +15,8 @@ Requires:
 * clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
 * vectors.tgz --- output of something like word2vec
 """
+from __future__ import unicode_literals
+
 import plac
 from pathlib import Path
 
@@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
 
 def _read_clusters(loc):
     if not loc.exists():
-        print "Warning: Clusters file not found"
+        print("Warning: Clusters file not found")
         return {}
     clusters = {}
     for line in codecs.open(str(loc), 'r', 'utf8'):
@@ -72,7 +74,7 @@ def _read_clusters(loc):
 
 def _read_probs(loc):
     if not loc.exists():
-        print "Warning: Probabilities file not found"
+        print("Warning: Probabilities file not found")
         return {}
     probs = {}
     for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
@@ -85,7 +87,7 @@ def _read_probs(loc):
 def _read_senses(loc):
     lexicon = defaultdict(lambda: defaultdict(list))
     if not loc.exists():
-        print "Warning: WordNet senses not found"
+        print("Warning: WordNet senses not found")
         return lexicon
     sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
     pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
@@ -109,7 +111,7 @@ def setup_vocab(src_dir, dst_dir):
     if vectors_src.exists():
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     else:
-        print "Warning: Word vectors file not found"
+        print("Warning: Word vectors file not found")
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
@@ -143,7 +145,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
     setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
     setup_vocab(corpora_dir, model_dir / 'vocab')
     if not (model_dir / 'wordnet').exists():
-        copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
+        copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
 
 
 if __name__ == '__main__':