mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Fix formatting and typo (closes #967)
This commit is contained in:
parent
734b0a4e4a
commit
e7ae3b7cc2
|
@ -1,7 +1,8 @@
|
||||||
'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
|
"""
|
||||||
|
Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
|
||||||
text, with each "sentence" on a newline, and spaces between tokens. Supports
|
text, with each "sentence" on a newline, and spaces between tokens. Supports
|
||||||
multi-processing.
|
multi-processing.
|
||||||
'''
|
"""
|
||||||
from __future__ import print_function, unicode_literals, division
|
from __future__ import print_function, unicode_literals, division
|
||||||
import io
|
import io
|
||||||
import bz2
|
import bz2
|
||||||
|
@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra):
|
||||||
|
|
||||||
|
|
||||||
def iter_texts_from_json_bz2(loc):
|
def iter_texts_from_json_bz2(loc):
|
||||||
'''
|
"""
|
||||||
Iterator of unicode strings, one per document (here, a comment).
|
Iterator of unicode strings, one per document (here, a comment).
|
||||||
|
|
||||||
Expects a path to a BZ2 file, which should be new-line delimited JSON. The
|
Expects a path to a BZ2 file, which should be new-line delimited JSON. The
|
||||||
document text should be in a string field titled 'body'.
|
document text should be in a string field titled 'body'.
|
||||||
|
|
||||||
This is the data format of the Reddit comments corpus.
|
This is the data format of the Reddit comments corpus.
|
||||||
'''
|
"""
|
||||||
with bz2.BZ2File(loc) as file_:
|
with bz2.BZ2File(loc) as file_:
|
||||||
for i, line in enumerate(file_):
|
for i, line in enumerate(file_):
|
||||||
yield ujson.loads(line)['body']
|
yield ujson.loads(line)['body']
|
||||||
|
@ -80,7 +81,7 @@ def is_sent_begin(word):
|
||||||
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
|
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
|
||||||
if not path.exists(out_dir):
|
if not path.exists(out_dir):
|
||||||
path.join(out_dir)
|
path.join(out_dir)
|
||||||
texts = partition(batch_size, iter_texts(in_loc))
|
texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
|
||||||
parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
|
parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user