diff --git a/examples/pos_tag.py b/examples/pos_tag.py
index c61d29636..1dd6add0f 100644
--- a/examples/pos_tag.py
+++ b/examples/pos_tag.py
@@ -1,7 +1,8 @@
-'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
-'''
+"""
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
@@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):
 
 
 def iter_texts_from_json_bz2(loc):
-    '''
+    """
     Iterator of unicode strings, one per document (here, a comment).
 
     Expects a a path to a BZ2 file, which should be new-line delimited
     JSON. The document text should be in a string field titled 'body'.
 
     This is the data format of the Reddit comments corpus.
-    '''
+    """
     with bz2.BZ2File(loc) as file_:
         for i, line in enumerate(file_):
             yield ujson.loads(line)['body']
@@ -80,7 +81,7 @@ def is_sent_begin(word):
 
 
 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
     if not path.exists(out_dir):
         path.join(out_dir)
-    texts = partition(batch_size, iter_texts(in_loc))
+    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
     parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
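
For reference, iter_texts_from_json_bz2 (now the function actually called from main) expects a BZ2 file of newline-delimited JSON with the document text in a 'body' field, as in the Reddit comments corpus. A minimal sketch of producing and consuming a file in that format, using the stdlib json in place of the script's ujson; the file name is hypothetical:

import bz2
import json

# Hypothetical sample input in the expected format: one JSON object per line,
# each with a 'body' string field.
docs = [{"body": "This is the first comment."},
        {"body": "And this is the second one."}]

with bz2.BZ2File("comments.jsonl.bz2", "wb") as f:
    for doc in docs:
        f.write((json.dumps(doc) + "\n").encode("utf8"))

# Read it back the same way the example script does, yielding one text per line.
with bz2.BZ2File("comments.jsonl.bz2") as f:
    for line in f:
        print(json.loads(line)["body"])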