Fix formatting and typo (closes #967)

2026-03-05 20:31:30 +03:00 · 2017-04-16 23:56:12 +02:00 · 2017-04-16 23:56:12 +02:00 · e7ae3b7cc2
commit e7ae3b7cc2
parent 734b0a4e4a
1 changed file with 6 additions and 5 deletions
--- a/examples/pos_tag.py
+++ b/examples/pos_tag.py
@@ -1,7 +1,8 @@
-'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
-'''
+"""
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
@@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):


 def iter_texts_from_json_bz2(loc):
-    '''
+    """
    Iterator of unicode strings, one per document (here, a comment).
    
    Expects a a path to a BZ2 file, which should be new-line delimited JSON. The
    document text should be in a string field titled 'body'.

    This is the data format of the Reddit comments corpus.
-    '''
+    """
    with bz2.BZ2File(loc) as file_:
        for i, line in enumerate(file_):
            yield ujson.loads(line)['body']
@@ -80,7 +81,7 @@ def is_sent_begin(word):
 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
    if not path.exists(out_dir):
        path.join(out_dir)
-    texts = partition(batch_size, iter_texts(in_loc))
+    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])