addressing reviews

This commit is contained in:
kadarakos 2022-12-15 12:07:29 +00:00
parent 1efef3f216
commit e75722cd15

View File

@@ -14,18 +14,15 @@ from ..vocab import Vocab
from ..util import ensure_path, load_model from ..util import ensure_path, load_model
path_help = ( path_help = """Location of the documents to predict on.
"Location of the documents to predict on. " Can be a single file in .spacy format or a .jsonl file.
"Can be a single file in .spacy format or a " Files with other extensions are treated as single plain text documents.
".jsonl file. Files with other extensions " If a directory is provided it is traversed recursively to grab
"are treated as single plain text documents. " all files to be processed.
"If a directory is provided " The files can be a mixture of .spacy, .jsonl and text files.
"it is traversed recursively to grab all files to " If .jsonl is provided the specified field is going
"be processed. The files can be a mixture of .spacy, " to be grabbed ("text" by default)."""
".jsonl and text files. If .jsonl is provided the "
"specified field is going to be grabbed ('text' "
"by default)."
)
out_help = "Path to save the resulting .spacy file" out_help = "Path to save the resulting .spacy file"
code_help = ( code_help = (
"Path to Python file with additional " "code (registered functions) to be imported" "Path to Python file with additional " "code (registered functions) to be imported"
@@ -56,7 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
""" """
for entry in srsly.read_jsonl(path): for entry in srsly.read_jsonl(path):
if field not in entry: if field not in entry:
raise msg.fail( msg.fail(
f"{path} does not contain the required '{field}' field.", exits=1 f"{path} does not contain the required '{field}' field.", exits=1
) )
else: else:
@@ -117,11 +114,14 @@ def apply(
batch_size: int, batch_size: int,
n_process: int, n_process: int,
): ):
docbin = DocBin(store_user_data=True)
paths = walk_directory(data_path)
if len(paths) == 0:
msg.fail("Did not find data to process,"
f" {data_path} seems to be an empty directory.", exits=1)
nlp = load_model(model) nlp = load_model(model)
msg.good(f"Loaded model {model}") msg.good(f"Loaded model {model}")
vocab = nlp.vocab vocab = nlp.vocab
docbin = DocBin(store_user_data=True)
paths = walk_directory(data_path)
streams: List[DocOrStrStream] = [] streams: List[DocOrStrStream] = []
text_files = [] text_files = []
for path in paths: for path in paths: