Merge remote-tracking branch 'refs/remotes/honnibal/master'

This commit is contained in:
maxirmx 2015-10-16 11:59:57 +03:00
commit 6de26d312c
2 changed files with 16 additions and 29 deletions

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
from __future__ import unicode_literals from __future__ import unicode_literals, print_function
import plac import plac
import joblib import joblib
@ -12,9 +12,11 @@ import codecs
from preshed.counter import PreshCounter from preshed.counter import PreshCounter
from joblib import Parallel, delayed from joblib import Parallel, delayed
import spacy.en from spacy.en import English
from spacy.strings import StringStore from spacy.strings import StringStore
from spacy.en.attrs import ORTH from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab
def iter_comments(loc): def iter_comments(loc):
@ -23,37 +25,22 @@ def iter_comments(loc):
yield ujson.loads(line) yield ujson.loads(line)
def null_props(string):
return {
'flags': 0,
'length': len(string),
'orth': string,
'lower': string,
'norm': string,
'shape': string,
'prefix': string,
'suffix': string,
'cluster': 0,
'prob': -22,
'sentiment': 0
}
def count_freqs(input_loc, output_loc):
    """Tokenize one shard of JSON comments and write token frequencies.

    Reads comments from *input_loc* (one ujson object per line, text under
    the 'body' key), counts token frequencies by ORTH id, and writes
    '<freq>\t<token>' lines to *output_loc*, skipping whitespace-only tokens.
    """
    print(output_loc)
    # Build a bare tokenizer only — no tagger/parser/lexical attributes —
    # since raw frequency counting needs nothing else.
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))
    counts = PreshCounter()
    for json_comment in iter_comments(input_loc):
        doc = tokenizer(json_comment['body'])
        doc.count_by(ORTH, counts=counts)
    with codecs.open(output_loc, 'w', 'utf8') as file_:
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]
            if not string.isspace():
                file_.write('%d\t%s\n' % (freq, string))
def parallelize(func, iterator, n_jobs): def parallelize(func, iterator, n_jobs):
def merge_counts(locs, out_loc):
    """Merge per-shard frequency files into one combined count file.

    Each input file in *locs* holds '<freq>\t<word>' lines; words are mapped
    to ids through a shared StringStore so counts for the same word are
    summed, then written back out in the same '<count>\t<word>' format.
    """
    string_map = StringStore()
    counts = PreshCounter()
    for loc in locs:
        with io.open(loc, 'r', encoding='utf8') as file_:
            for line in file_:
                freq, word = line.strip().split('\t', 1)
                counts.inc(string_map[word], int(freq))
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for orth, count in counts:
            file_.write('%d\t%s\n' % (count, string_map[orth]))
@ -98,7 +85,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
if tasks: if tasks:
parallelize(count_freqs, tasks, n_jobs) parallelize(count_freqs, tasks, n_jobs)
print "Merge" print("Merge")
merge_counts(outputs, output_loc) merge_counts(outputs, output_loc)

View File

@ -179,7 +179,7 @@ VERSION = '0.95'
def main(modules, is_pypy): def main(modules, is_pypy):
language = "cpp" language = "cpp"
includes = ['.', path.join(sys.prefix, 'include')] includes = ['.', path.join(sys.prefix, 'include')]
if sys.prefix == 'darwin': if sys.platform.startswith('darwin'):
compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++']) compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
link_opions['other'].append('-lc++') link_opions['other'].append('-lc++')
if use_cython: if use_cython: