Fix formatting and typo (closes #967)

This commit is contained in:
Ines Montani 2017-04-16 23:56:12 +02:00 committed by GitHub
parent 734b0a4e4a
commit e7ae3b7cc2

View File

@ -1,7 +1,8 @@
'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
"""
Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
text, with each "sentence" on a newline, and spaces between tokens. Supports
multi-processing.
'''
"""
from __future__ import print_function, unicode_literals, division
import io
import bz2
@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):
def iter_texts_from_json_bz2(loc):
    """
    Iterator of unicode strings, one per document (here, a comment).
    Expects a path to a BZ2 file, which should be newline-delimited JSON. The
    document text should be in a string field titled 'body'.
    This is the data format of the Reddit comments corpus.

    loc: Path of the BZ2-compressed file to read.
    Yields the value of the 'body' key of each decoded line, in file order.
    """
    with bz2.BZ2File(loc) as file_:
        # Each line is an independent JSON document; decode lazily so the
        # whole archive never has to be held in memory at once.
        for line in file_:
            yield ujson.loads(line)['body']
@ -80,7 +81,7 @@ def is_sent_begin(word):
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
    """
    Read the documents at in_loc, group them into batches and hand the
    batches to transform_texts through parallelize.

    in_loc: Path of the input file, passed to iter_texts_from_json_bz2.
    out_dir: Output directory; created here if it does not exist yet. It is
        forwarded to transform_texts as the single extra argument.
    n_workers: Number of jobs passed to parallelize.
    batch_size: Batch size passed to partition.
    """
    import os

    if not path.exists(out_dir):
        # The previous `path.join(out_dir)` only returned the path and never
        # touched the file system, so the directory was never created.
        os.makedirs(out_dir)
    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
    # Batches are enumerated so that each one carries its index alongside the
    # texts when it is handed to transform_texts.
    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])