* Update init_model, making language resources optional

2025-09-18 18:12:45 +03:00 · 2015-07-22 00:25:14 +02:00 · 2015-07-22 00:25:14 +02:00 · 386246db5b
commit 386246db5b
parent 424854028f
1 changed file with 11 additions and 0 deletions
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -44,6 +44,9 @@ def setup_tokenizer(lang_data_dir, tok_dir):
 def _read_clusters(loc):
    if not loc.exists():
        print "Warning: Clusters file not found"
        return {}
    clusters = {}
    for line in codecs.open(str(loc), 'r', 'utf8'):
        try:
@@ -68,6 +71,9 @@ def _read_clusters(loc):
 def _read_probs(loc):
    if not loc.exists():
        print "Warning: Probabilities file not found"
        return {}
    probs = {}
    for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
        prob, word = line.split()
@@ -78,6 +84,9 @@ def _read_probs(loc):
 def _read_senses(loc):
    lexicon = defaultdict(lambda: defaultdict(list))
    if not loc.exists():
        print "Warning: WordNet senses not found"
        return lexicon
    sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
    pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
    for line in codecs.open(str(loc), 'r', 'utf8'):
@@ -99,6 +108,8 @@ def setup_vocab(src_dir, dst_dir):
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    else:
        print "Warning: Word vectors file not found"
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')