	Merge branch 'refactor' of ssh://github.com/honnibal/spaCy into refactor
commit 6cfa83157e
@@ -40,8 +40,7 @@ def null_props(string):


 def count_freqs(input_loc, output_loc):
-    nlp = spacy.en.English(data_dir=os.environ['SPACY_DATA'], Parser=None,
-                           Tagger=None, Entity=None, load_vectors=False)
+    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
     nlp.vocab.lexeme_props_getter = null_props

     counts = PreshCounter()
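For context, this hunk builds the English pipeline with the parser, tagger and entity recognizer disabled, since only the tokenizer and vocab are needed to count token frequencies. The counting loop itself is outside the hunk; the sketch below is an assumption about how such a loop typically looks with the spaCy 0.x and preshed APIs (token.orth, PreshCounter.inc), not part of this commit.

    # Minimal sketch (assumed usage, not from this diff): count orth-ID
    # frequencies for one bz2-compressed plain-text shard with only the
    # tokenizer loaded. The English(...) call mirrors the hunk above;
    # token.orth and PreshCounter.inc are assumed spaCy 0.x / preshed APIs.
    import bz2

    from preshed.counter import PreshCounter
    import spacy.en

    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
    counts = PreshCounter()
    with bz2.BZ2File('shard.bz2') as file_:   # hypothetical input file
        for line in file_:
            line = line.decode('utf8').strip()
            if not line:
                continue
            for token in nlp(line):
                counts.inc(token.orth, 1)     # token.orth: integer ID of the token string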
@@ -76,15 +75,17 @@ def merge_counts(locs, out_loc):


 @plac.annotations(
-    input_dir=("Directory of input files"),
+    input_loc=("Location of input file list"),
     freqs_dir=("Directory for frequency files"),
     output_loc=("Location for output file"),
     n_jobs=("Number of workers", "option", "n", int),
 )
-def main(input_dir, freqs_dir, output_loc, n_jobs=2):
+def main(input_loc, freqs_dir, output_loc, n_jobs=2):
     tasks = []
-    for filename in os.listdir(input_dir):
-        input_path = path.join(input_dir, filename)
+    for input_path in open(input_loc):
+        input_path = input_path.strip()
+        if not input_path: continue
+        filename = input_path.split('/')[-1]
         output_path = path.join(freqs_dir, filename.replace('bz2', 'freq'))
         tasks.append((input_path, output_path))

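With this change, main() reads a newline-separated list of input paths instead of walking a directory, and derives each output name from the file's basename. A rough usage sketch follows, with illustrative script and path names (none of them come from the commit):

    # Assumed invocation (names are illustrative):
    #   find /data/wiki -name '*.bz2' | sort > input_files.txt
    #   python get_freqs.py input_files.txt /data/freqs /data/merged.freq -n 8
    #
    # Each non-empty line of the list becomes one (input_path, output_path) task:
    from os import path

    tasks = []
    for input_path in open('input_files.txt'):
        input_path = input_path.strip()
        if not input_path:
            continue
        filename = input_path.split('/')[-1]                 # e.g. wiki-part01.bz2
        output_path = path.join('/data/freqs', filename.replace('bz2', 'freq'))
        tasks.append((input_path, output_path))              # -> /data/freqs/wiki-part01.freq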
@@ -30,8 +30,6 @@ from spacy.vocab import write_binary_vectors

 from spacy.parts_of_speech import NOUN, VERB, ADJ

-import spacy.senses
-

 def setup_tokenizer(lang_data_dir, tok_dir):
     if not tok_dir.exists():
@@ -103,11 +101,7 @@ def setup_vocab(src_dir, dst_dir):
         write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
     vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
     clusters = _read_clusters(src_dir / 'clusters.txt')
-    senses = _read_senses(src_dir / 'supersenses.txt')
     probs = _read_probs(src_dir / 'words.sgt.prob')
-    for word in set(clusters).union(set(senses)):
-        if word not in probs:
-            probs[word] = -17.0
     lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
     lexicon = []
     for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
@@ -120,15 +114,6 @@ def setup_vocab(src_dir, dst_dir):
             entry['cluster'] = int(cluster[::-1], 2)
             orth_senses = set()
             lemmas = []
-            for pos in [NOUN, VERB, ADJ]:
-                for lemma in lemmatizer(word.lower(), pos):
-                    lemmas.append(lemma)
-                    orth_senses.update(senses[lemma][pos])
-            if word.lower() == 'dogging':
-                print word
-                print lemmas
-                print [spacy.senses.STRINGS[si] for si in orth_senses]
-            entry['senses'] = list(sorted(orth_senses))
             vocab[word] = entry
     vocab.dump(str(dst_dir / 'lexemes.bin'))
     vocab.strings.dump(str(dst_dir / 'strings.txt'))