Merge remote-tracking branch 'refs/remotes/honnibal/master'

maxirmx 2015-10-16 11:59:57 +03:00
commit 6de26d312c
2 changed files with 16 additions and 29 deletions

View File

@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import unicode_literals
+from __future__ import unicode_literals, print_function
 import plac
 import joblib
@@ -12,9 +12,11 @@ import codecs
 from preshed.counter import PreshCounter
 from joblib import Parallel, delayed
-import spacy.en
+from spacy.en import English
 from spacy.strings import StringStore
-from spacy.en.attrs import ORTH
+from spacy.attrs import ORTH
+from spacy.tokenizer import Tokenizer
+from spacy.vocab import Vocab

 def iter_comments(loc):
@@ -23,37 +25,22 @@ def iter_comments(loc):
             yield ujson.loads(line)

-def null_props(string):
-    return {
-        'flags': 0,
-        'length': len(string),
-        'orth': string,
-        'lower': string,
-        'norm': string,
-        'shape': string,
-        'prefix': string,
-        'suffix': string,
-        'cluster': 0,
-        'prob': -22,
-        'sentiment': 0
-    }

 def count_freqs(input_loc, output_loc):
-    print output_loc
-    nlp = spacy.en.English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
-    nlp.vocab.lexeme_props_getter = null_props
+    print(output_loc)
+    vocab = English.default_vocab(get_lex_attr=None)
+    tokenizer = Tokenizer.from_dir(vocab,
+                    path.join(English.default_data_dir(), 'tokenizer'))

     counts = PreshCounter()
-    tokenizer = nlp.tokenizer
     for json_comment in iter_comments(input_loc):
         doc = tokenizer(json_comment['body'])
         doc.count_by(ORTH, counts=counts)
     with codecs.open(output_loc, 'w', 'utf8') as file_:
         for orth, freq in counts:
-            string = nlp.vocab.strings[orth]
-            file_.write('%d\t%s\n' % (freq, repr(string)))
+            string = tokenizer.vocab.strings[orth]
+            if not string.isspace():
+                file_.write('%d\t%s\n' % (freq, string))

 def parallelize(func, iterator, n_jobs):
@@ -64,12 +51,12 @@ def merge_counts(locs, out_loc):
     string_map = StringStore()
     counts = PreshCounter()
     for loc in locs:
-        with codecs.open(loc, 'r', 'utf8') as file_:
+        with io.open(loc, 'r', encoding='utf8') as file_:
             for line in file_:
                 freq, word = line.strip().split('\t', 1)
                 orth = string_map[word]
                 counts.inc(orth, int(freq))
-    with codecs.open(out_loc, 'w', 'utf8') as file_:
+    with io.open(out_loc, 'w', encoding='utf8') as file_:
         for orth, count in counts:
             string = string_map[orth]
             file_.write('%d\t%s\n' % (count, string))
@@ -98,7 +85,7 @@ def main(input_loc, freqs_dir, output_loc, n_jobs=2, skip_existing=False):
     if tasks:
         parallelize(count_freqs, tasks, n_jobs)

-    print "Merge"
+    print("Merge")
     merge_counts(outputs, output_loc)
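
Taken together, the changes to this file move the frequency-counting script to Python 3-compatible idioms (print_function, io.open in place of codecs.open) and build a standalone tokenizer instead of a full English() pipeline with every component switched off. Below is a minimal sketch of the resulting pattern, assuming the spaCy 0.95-era API that appears in the diff; the 'comments.jsonl' and 'freqs.tsv' paths are hypothetical stand-ins for the script's plac-driven arguments and its iter_comments() helper.

    from __future__ import unicode_literals, print_function
    import io
    from os import path

    import ujson
    from preshed.counter import PreshCounter
    from spacy.en import English
    from spacy.attrs import ORTH
    from spacy.tokenizer import Tokenizer

    # Build a bare tokenizer from the on-disk data; this replaces the old
    # English(Parser=None, Tagger=None, Entity=None, load_vectors=False)
    # construction that disabled each pipeline component by hand.
    vocab = English.default_vocab(get_lex_attr=None)
    tokenizer = Tokenizer.from_dir(
        vocab, path.join(English.default_data_dir(), 'tokenizer'))

    counts = PreshCounter()
    with io.open('comments.jsonl', 'r', encoding='utf8') as file_:  # hypothetical input
        for line in file_:
            doc = tokenizer(ujson.loads(line)['body'])
            doc.count_by(ORTH, counts=counts)  # accumulate ORTH-id frequencies

    with io.open('freqs.tsv', 'w', encoding='utf8') as file_:  # hypothetical output
        for orth, freq in counts:
            string = tokenizer.vocab.strings[orth]  # decode id to surface form
            if not string.isspace():  # skip whitespace-only tokens
                file_.write('%d\t%s\n' % (freq, string))

Note that the isspace() guard also replaces the old repr() escaping, so the per-shard counts files that merge_counts() later reads back contain raw token text.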

View File

@@ -179,7 +179,7 @@ VERSION = '0.95'
 def main(modules, is_pypy):
     language = "cpp"
     includes = ['.', path.join(sys.prefix, 'include')]
-    if sys.prefix == 'darwin':
+    if sys.platform.startswith('darwin'):
         compile_options['other'].append(['-mmacosx-version-min=10.8', '-stdlib=libc++'])
         link_opions['other'].append('-lc++')
     if use_cython:
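
The setup.py change is a one-line bug fix: sys.prefix is the interpreter's installation prefix (a filesystem path such as '/usr'), so it can never equal 'darwin', and the OS X-specific compiler flags were silently skipped. A small sketch of the corrected check; compile_args and link_args are hypothetical names standing in for the script's option dicts.

    import sys

    # sys.platform names the operating system: 'darwin' on OS X, and
    # historically 'linux2' on Linux, which is why startswith() is the
    # conventional idiom rather than an equality test.
    if sys.platform.startswith('darwin'):
        compile_args = ['-mmacosx-version-min=10.8', '-stdlib=libc++']
        link_args = ['-lc++']
    else:
        compile_args, link_args = [], []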