diff --git a/bin/init_model.py b/bin/init_model.py
index 1c06285ad..37b93c312 100644
--- a/bin/init_model.py
+++ b/bin/init_model.py
@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
 from spacy.parts_of_speech import NOUN, VERB, ADJ
 
-import spacy.senses
-
 
 
 def setup_tokenizer(lang_data_dir, tok_dir):
     if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
-    senses = _read_senses(src_dir / 'supersenses.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    for word in set(clusters).union(set(senses)):
-        if word not in probs:
-            probs[word] = -17.0
     lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
     lexicon = []
     for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
             entry['cluster'] = int(cluster[::-1], 2)
         orth_senses = set()
         lemmas = []
-        for pos in [NOUN, VERB, ADJ]:
-            for lemma in lemmatizer(word.lower(), pos):
-                lemmas.append(lemma)
-                orth_senses.update(senses[lemma][pos])
-        if word.lower() == 'dogging':
-            print word
-            print lemmas
-            print [spacy.senses.STRINGS[si] for si in orth_senses]
-        entry['senses'] = list(sorted(orth_senses))
         vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))