Merge branch 'refactor' of ssh://github.com/honnibal/spaCy into refactor

Matthew Honnibal 2015-07-17 21:38:04 +02:00
commit 6cfa83157e
2 changed files with 7 additions and 21 deletions

View File

@@ -40,8 +40,7 @@ def null_props(string):
def count_freqs(input_loc, output_loc):
    nlp = spacy.en.English(data_dir=os.environ['SPACY_DATA'], Parser=None,
                           Tagger=None, Entity=None, load_vectors=False)
    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
    nlp.vocab.lexeme_props_getter = null_props
    counts = PreshCounter()
@@ -76,15 +75,17 @@ def merge_counts(locs, out_loc):
@plac.annotations(
    input_dir=("Directory of input files"),
    input_loc=("Location of input file list"),
    freqs_dir=("Directory for frequency files"),
    output_loc=("Location for output file"),
    n_jobs=("Number of workers", "option", "n", int),
)
def main(input_dir, freqs_dir, output_loc, n_jobs=2):
def main(input_loc, freqs_dir, output_loc, n_jobs=2):
    tasks = []
    for filename in os.listdir(input_dir):
        input_path = path.join(input_dir, filename)
    for input_path in open(input_loc):
        input_path = input_path.strip()
        if not input_path: continue
        filename = input_path.split('/')[-1]
        output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
        tasks.append((input_path, output_path))
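
Read as a whole, the new main() takes a file listing one corpus path per line instead of walking a directory. A minimal sketch of the task-building loop after this change; build_tasks is a hypothetical name used here only for illustration, the real code keeps this logic inline in main():

from os import path

def build_tasks(input_loc, freqs_dir):
    # Read one input path per line from the file list, skipping blank lines,
    # and pair each with an output path in freqs_dir (e.g. foo.bz2 -> foo.freq).
    tasks = []
    for input_path in open(input_loc):
        input_path = input_path.strip()
        if not input_path:
            continue
        filename = input_path.split('/')[-1]
        output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
        tasks.append((input_path, output_path))
    return tasks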

View File

@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.senses


def setup_tokenizer(lang_data_dir, tok_dir):
    if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
    write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'supersenses.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    lexicon = []
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
        entry['cluster'] = int(cluster[::-1], 2)
        orth_senses = set()
        lemmas = []
        for pos in [NOUN, VERB, ADJ]:
            for lemma in lemmatizer(word.lower(), pos):
                lemmas.append(lemma)
                orth_senses.update(senses[lemma][pos])
        if word.lower() == 'dogging':
            print word
            print lemmas
            print [spacy.senses.STRINGS[si] for si in orth_senses]
        entry['senses'] = list(sorted(orth_senses))
        vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
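
With the supersense lookup and the debug prints gone, the tail of setup_vocab() presumably just attaches the Brown-cluster id to each entry and stores it. A hedged sketch of that remaining loop, assuming the lines not shown in this hunk are unchanged; finish_vocab, get_entry, and the '0' cluster default are hypothetical names and values used only to make the fragment self-contained:

def finish_vocab(vocab, probs, clusters, dst_dir, get_entry):
    # Highest-probability words are inserted first, matching the reversed sort above.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_entry(word, prob)  # hypothetical stand-in for the lexeme-props lookup
        cluster = clusters.get(word, '0')  # assumed default when a word has no cluster
        # The cluster bit-string is reversed before being parsed as base 2.
        entry['cluster'] = int(cluster[::-1], 2)
        vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))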