diff --git a/bin/get_freqs.py b/bin/get_freqs.py
index c9f858b22..e126a2eee 100755
--- a/bin/get_freqs.py
+++ b/bin/get_freqs.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import unicode_literals
+from __future__ import unicode_literals, print_function
 
 import plac
 import joblib
 
@@ -12,9 +12,11 @@ import codecs
 
 from preshed.counter import PreshCounter
 from joblib import Parallel, delayed
-import spacy.en
+from spacy.en import English
 from spacy.strings import StringStore
-from spacy.en.attrs import ORTH
+from spacy.attrs import ORTH
+from spacy.tokenizer import Tokenizer
+from spacy.vocab import Vocab
 
 
 def iter_comments(loc):
@@ -23,37 +25,22 @@ def iter_comments(loc):
         yield ujson.loads(line)
 
 
-def null_props(string):
-    return {
-        'flags': 0,
-        'length': len(string),
-        'orth': string,
-        'lower': string,
-        'norm': string,
-        'shape': string,
-        'prefix': string,
-        'suffix': string,
-        'cluster': 0,
-        'prob': -22,
-        'sentiment': 0
-    }
-
-
 def count_freqs(input_loc, output_loc):
-    print output_loc
-    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
-    nlp.vocab.lexeme_props_getter = null_props
+    print(output_loc)
+    vocab = English.default_vocab(get_lex_attr=None)
+    tokenizer = Tokenizer.from_dir(vocab,
+                    path.join(English.default_data_dir(), 'tokenizer'))
 
     counts = PreshCounter()
-    tokenizer = nlp.tokenizer
     for json_comment in iter_comments(input_loc):
         doc = tokenizer(json_comment['body'])
         doc.count_by(ORTH, counts=counts)
 
     with codecs.open(output_loc, 'w', 'utf8') as file_:
         for orth, freq in counts:
-            string = nlp.vocab.strings[orth]
-            file_.write('%d\t%s\n' % (freq, repr(string)))
+            string = tokenizer.vocab.strings[orth]
+            if not string.isspace():
+                file_.write('%d\t%s\n' % (freq, string))
 
 
 def parallelize(func, iterator, n_jobs):
@@ -64,12 +51,12 @@ def merge_counts(locs, out_loc):
     string_map = StringStore()
     counts = PreshCounter()
     for loc in locs:
-        with codecs.open(loc, 'r', 'utf8') as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             for line in file_:
                 freq, word = line.strip().split('\t', 1)
                 orth = string_map[word]
                 counts.inc(orth, int(freq))
-    with codecs.open(out_loc, 'w', 'utf8') as file_:
+    with io.open(out_loc, 'w', encoding='utf8') as file_:
         for orth, count in counts:
             string = string_map[orth]
             file_.write('%d\t%s\n' % (count, string))
@@ -98,7 +85,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
 
     if tasks:
         parallelize(count_freqs, tasks, n_jobs)
 
-    print "Merge"
+    print("Merge")
     merge_counts(outputs, output_loc)
 
diff --git a/setup.py b/setup.py
index 21f9c6f53..f06be8104 100644
--- a/setup.py
+++ b/setup.py
@@ -179,7 +179,7 @@ VERSION = '0.95'
 def main(modules, is_pypy):
     language = "cpp"
     includes = ['.', path.join(sys.prefix, 'include')]
-    if sys.prefix == 'darwin':
+    if sys.platform.startswith('darwin'):
         compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
         link_opions['other'].append('-lc++')
     if use_cython:
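
Not part of the patch: a minimal standalone sketch of the tokenize-and-count
pattern the diff migrates to, assembled only from calls that already appear in
the hunks above (the 0.x-era spaCy API: English.default_vocab,
Tokenizer.from_dir, Doc.count_by). The sample text is illustrative.

    from os import path

    from preshed.counter import PreshCounter
    from spacy.en import English
    from spacy.attrs import ORTH
    from spacy.tokenizer import Tokenizer

    # Build a bare tokenizer with no tagger/parser/NER pipeline,
    # the same way the patched count_freqs() does.
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    # Accumulate token frequencies keyed by ORTH (string ID), then
    # resolve each ID back to text through the vocab's StringStore.
    counts = PreshCounter()
    doc = tokenizer(u'An example comment body.')
    doc.count_by(ORTH, counts=counts)
    for orth, freq in counts:
        print('%d\t%s' % (freq, tokenizer.vocab.strings[orth]))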