mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Remove sense stuff from init_model
This commit is contained in:
		
							parent
							
								
									3de1b3ef1d
								
							
						
					
					
						commit
						af54d05d60
					
				| 
						 | 
				
			
			@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
 | 
			
		|||
 | 
			
		||||
from spacy.parts_of_speech import NOUN, VERB, ADJ
 | 
			
		||||
 | 
			
		||||
import spacy.senses
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def setup_tokenizer(lang_data_dir, tok_dir):
 | 
			
		||||
    if not tok_dir.exists():
 | 
			
		||||
| 
						 | 
				
			
			@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
 | 
			
		|||
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
 | 
			
		||||
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
 | 
			
		||||
    clusters = _read_clusters(src_dir / 'clusters.txt')
 | 
			
		||||
    senses = _read_senses(src_dir / 'supersenses.txt')
 | 
			
		||||
    probs = _read_probs(src_dir / 'words.sgt.prob')
 | 
			
		||||
    for word in set(clusters).union(set(senses)):
 | 
			
		||||
        if word not in probs:
 | 
			
		||||
            probs[word] = -17.0
 | 
			
		||||
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
 | 
			
		||||
    lexicon = []
 | 
			
		||||
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
 | 
			
		||||
| 
						 | 
				
			
			@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
 | 
			
		|||
            entry['cluster'] = int(cluster[::-1], 2)
 | 
			
		||||
            orth_senses = set()
 | 
			
		||||
            lemmas = []
 | 
			
		||||
            for pos in [NOUN, VERB, ADJ]:
 | 
			
		||||
                for lemma in lemmatizer(word.lower(), pos):
 | 
			
		||||
                    lemmas.append(lemma)
 | 
			
		||||
                    orth_senses.update(senses[lemma][pos])
 | 
			
		||||
            if word.lower() == 'dogging':
 | 
			
		||||
                print word
 | 
			
		||||
                print lemmas
 | 
			
		||||
                print [spacy.senses.STRINGS[si] for si in orth_senses]
 | 
			
		||||
            entry['senses'] = list(sorted(orth_senses))
 | 
			
		||||
            vocab[word] = entry
 | 
			
		||||
    vocab.dump(str(dst_dir / 'lexemes.bin'))
 | 
			
		||||
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user