Merge branch 'refactor' of ssh://github.com/honnibal/spaCy into refactor

commit 6cfa83157e
@@ -40,8 +40,7 @@ def null_props(string):
 
 
 def count_freqs(input_loc, output_loc):
-    nlp = spacy.en.English(data_dir=os.environ['SPACY_DATA'], Parser=None,
-                           Tagger=None, Entity=None, load_vectors=False)
+    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
     nlp.vocab.lexeme_props_getter = null_props
 
     counts = PreshCounter()
@@ -76,15 +75,17 @@ def merge_counts(locs, out_loc):
 
 
 @plac.annotations(
-    input_dir=("Directory of input files"),
+    input_loc=("Location of input file list"),
     freqs_dir=("Directory for frequency files"),
     output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
 )
-def main(input_dir, freqs_dir, output_loc, n_jobs=2):
+def main(input_loc, freqs_dir, output_loc, n_jobs=2):
     tasks = []
-    for filename in os.listdir(input_dir):
-        input_path = path.join(input_dir, filename)
+    for input_path in open(input_loc):
+        input_path = input_path.strip()
+        if not input_path: continue
+        filename = input_path.split('/')[-1]
         output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
         tasks.append((input_path, output_path))
 
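For context on the second hunk: after the refactor, main takes a newline-separated list of input paths (input_loc) instead of walking a directory. Below is a minimal standalone sketch of that task-building loop; the file contents, paths, and freqs_dir are made up for illustration, and in the real script the tasks are presumably handed to n_jobs workers that each call count_freqs.

from os import path

# Made-up stand-in for the file passed as input_loc:
# one bz2 input path per line, blank lines allowed.
file_list = """\
/data/enwiki/part-001.bz2

/data/enwiki/part-002.bz2
"""

freqs_dir = '/data/freqs'

tasks = []
for input_path in file_list.splitlines():
    input_path = input_path.strip()
    if not input_path:
        continue  # skip blank lines, as the refactored loop does
    filename = input_path.split('/')[-1]
    # e.g. 'part-001.bz2' -> 'part-001.freq'
    output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
    tasks.append((input_path, output_path))

print(tasks)

Expected output: [('/data/enwiki/part-001.bz2', '/data/freqs/part-001.freq'), ('/data/enwiki/part-002.bz2', '/data/freqs/part-002.freq')].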
@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
 
 from spacy.parts_of_speech import NOUN, VERB, ADJ
 
-import spacy.senses
-
 
 def setup_tokenizer(lang_data_dir, tok_dir):
     if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
-    senses = _read_senses(src_dir / 'supersenses.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    for word in set(clusters).union(set(senses)):
-        if word not in probs:
-            probs[word] = -17.0
     lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
     lexicon = []
     for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
             entry['cluster'] = int(cluster[::-1], 2)
-            orth_senses = set()
-            lemmas = []
-            for pos in [NOUN, VERB, ADJ]:
-                for lemma in lemmatizer(word.lower(), pos):
-                    lemmas.append(lemma)
-                    orth_senses.update(senses[lemma][pos])
-            if word.lower() == 'dogging':
-                print word
-                print lemmas
-                print [spacy.senses.STRINGS[si] for si in orth_senses]
-            entry['senses'] = list(sorted(orth_senses))
             vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))
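One line kept by the last hunk is easy to misread: entry['cluster'] = int(cluster[::-1], 2) stores the Brown cluster as an integer by reversing the bitstring before parsing it as base 2, presumably so the informative prefix of the cluster path lands in the low-order bits. A tiny illustration with a made-up cluster string:

# '1101' is a made-up Brown cluster path; the real strings come from clusters.txt.
cluster = '1101'
encoded = int(cluster[::-1], 2)   # reversed '1011' -> 11
print(encoded)                    # 11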