* Remove supersense handling (spacy.senses import, supersenses.txt lookup, and debug prints) from init_model

This commit is contained in:
Matthew Honnibal 2015-07-14 10:56:17 +02:00
parent 3de1b3ef1d
commit af54d05d60

View File

@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.senses
def setup_tokenizer(lang_data_dir, tok_dir): def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists(): if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt') clusters = _read_clusters(src_dir / 'clusters.txt')
senses = _read_senses(src_dir / 'supersenses.txt')
probs = _read_probs(src_dir / 'words.sgt.prob') probs = _read_probs(src_dir / 'words.sgt.prob')
for word in set(clusters).union(set(senses)):
if word not in probs:
probs[word] = -17.0
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ) lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
lexicon = [] lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])): for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
entry['cluster'] = int(cluster[::-1], 2) entry['cluster'] = int(cluster[::-1], 2)
orth_senses = set() orth_senses = set()
lemmas = [] lemmas = []
for pos in [NOUN, VERB, ADJ]:
for lemma in lemmatizer(word.lower(), pos):
lemmas.append(lemma)
orth_senses.update(senses[lemma][pos])
if word.lower() == 'dogging':
print word
print lemmas
print [spacy.senses.STRINGS[si] for si in orth_senses]
entry['senses'] = list(sorted(orth_senses))
vocab[word] = entry vocab[word] = entry
vocab.dump(str(dst_dir / 'lexemes.bin')) vocab.dump(str(dst_dir / 'lexemes.bin'))
vocab.strings.dump(str(dst_dir / 'strings.txt')) vocab.strings.dump(str(dst_dir / 'strings.txt'))