	Merge branch 'refactor' of ssh://github.com/honnibal/spaCy into refactor
commit 6cfa83157e
@@ -40,8 +40,7 @@ def null_props(string):


 def count_freqs(input_loc, output_loc):
-    nlp = spacy.en.English(data_dir=os.environ['SPACY_DATA'], Parser=None,
-                           Tagger=None, Entity=None, load_vectors=False)
+    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
     nlp.vocab.lexeme_props_getter = null_props

     counts = PreshCounter()
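For context, this hunk builds the English pipeline with the parser, tagger and entity recognizer disabled, since only the tokenizer and vocab are needed to count token frequencies. The counting loop itself is outside the hunk; the sketch below is an assumption about how such a loop typically looks with the spaCy 0.x and preshed APIs (token.orth, PreshCounter.inc), not part of this commit.

    # Minimal sketch (assumed usage, not from this diff): count orth-ID
    # frequencies for one bz2-compressed plain-text shard with only the
    # tokenizer loaded. The English(...) call mirrors the hunk above;
    # token.orth and PreshCounter.inc are assumed spaCy 0.x / preshed APIs.
    import bz2

    from preshed.counter import PreshCounter
    import spacy.en

    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
    counts = PreshCounter()
    with bz2.BZ2File('shard.bz2') as file_:   # hypothetical input file
        for line in file_:
            line = line.decode('utf8').strip()
            if not line:
                continue
            for token in nlp(line):
                counts.inc(token.orth, 1)     # token.orth: integer ID of the token string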
@@ -76,15 +75,17 @@ def merge_counts(locs, out_loc):


 @plac.annotations(
-    input_dir=("Directory of input files"),
+    input_loc=("Location of input file list"),
     freqs_dir=("Directory for frequency files"),
     output_loc=("Location for output file"),
     n_jobs=("Number of workers", "option", "n", int),
 )
-def main(input_dir, freqs_dir, output_loc, n_jobs=2):
+def main(input_loc, freqs_dir, output_loc, n_jobs=2):
     tasks = []
-    for filename in os.listdir(input_dir):
-        input_path = path.join(input_dir, filename)
+    for input_path in open(input_loc):
+        input_path = input_path.strip()
+        if not input_path: continue
+        filename = input_path.split('/')[-1]
         output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
         tasks.append((input_path, output_path))

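With this change, main() reads a newline-separated list of input paths instead of walking a directory, and derives each output name from the file's basename. A rough usage sketch follows, with illustrative script and path names (none of them come from the commit):

    # Assumed invocation (names are illustrative):
    #   find /data/wiki -name '*.bz2' | sort > input_files.txt
    #   python get_freqs.py input_files.txt /data/freqs /data/merged.freq -n 8
    #
    # Each non-empty line of the list becomes one (input_path, output_path) task:
    from os import path

    tasks = []
    for input_path in open('input_files.txt'):
        input_path = input_path.strip()
        if not input_path:
            continue
        filename = input_path.split('/')[-1]                 # e.g. wiki-part01.bz2
        output_path = path.join('/data/freqs', filename.replace('bz2', 'freq'))
        tasks.append((input_path, output_path))              # -> /data/freqs/wiki-part01.freq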
@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors

 from spacy.parts_of_speech import NOUN, VERB, ADJ

-import spacy.senses
-

 def setup_tokenizer(lang_data_dir, tok_dir):
     if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
-    senses = _read_senses(src_dir / 'supersenses.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    for word in set(clusters).union(set(senses)):
-        if word not in probs:
-            probs[word] = -17.0
     lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
     lexicon = []
     for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
             entry['cluster'] = int(cluster[::-1], 2)
             orth_senses = set()
             lemmas = []
-            for pos in [NOUN, VERB, ADJ]:
-                for lemma in lemmatizer(word.lower(), pos):
-                    lemmas.append(lemma)
-                    orth_senses.update(senses[lemma][pos])
-            if word.lower() == 'dogging':
-                print word
-                print lemmas
-                print [spacy.senses.STRINGS[si] for si in orth_senses]
-            entry['senses'] = list(sorted(orth_senses))
             vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))