mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Fix formatting and typo (closes #967)
This commit is contained in:
parent
734b0a4e4a
commit
e7ae3b7cc2
|
@ -1,7 +1,8 @@
|
|||
'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
|
||||
"""
|
||||
Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
|
||||
text, with each "sentence" on a newline, and spaces between tokens. Supports
|
||||
multi-processing.
|
||||
'''
|
||||
"""
|
||||
from __future__ import print_function, unicode_literals, division
|
||||
import io
|
||||
import bz2
|
||||
|
@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):
|
|||
|
||||
|
||||
def iter_texts_from_json_bz2(loc):
|
||||
'''
|
||||
"""
|
||||
Iterator of unicode strings, one per document (here, a comment).
|
||||
|
||||
Expects a path to a BZ2 file, which should be newline-delimited JSON. The
|
||||
document text should be in a string field titled 'body'.
|
||||
|
||||
This is the data format of the Reddit comments corpus.
|
||||
'''
|
||||
"""
|
||||
with bz2.BZ2File(loc) as file_:
|
||||
for i, line in enumerate(file_):
|
||||
yield ujson.loads(line)['body']
|
||||
|
@ -80,7 +81,7 @@ def is_sent_begin(word):
|
|||
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
|
||||
if not path.exists(out_dir):
|
||||
path.join(out_dir)
|
||||
texts = partition(batch_size, iter_texts(in_loc))
|
||||
texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
|
||||
parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user