From 9b404ea33c428c1f942213ab2194f95ab02dd4e2 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Wed, 26 Oct 2022 14:01:17 +0000
Subject: [PATCH] don't warn but raise

---
 spacy/cli/apply.py | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index 85b843830..1784c3d83 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -3,7 +3,7 @@ import srsly
 
 from itertools import chain
 from pathlib import Path
-from typing import Optional, Generator, Union, List, Iterable, cast
+from typing import Optional, List, Iterable, cast, Union
 
 from wasabi import msg
 
@@ -25,6 +25,8 @@ code_help = ("Path to Python file with additional "
              "code (registered functions) to be imported")
 gold_help = "Use gold preprocessing provided in the .spacy files"
 
+DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
+
 
 def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
     """
@@ -49,26 +51,14 @@ def _stream_jsonl(path: Path) -> Iterable[str]:
             yield entry["text"]
 
 
-def _maybe_read_text(path: Path) -> Union[str, None]:
+def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
     """
-    Try to read the text file from the provided path.
-    When encoutering a decoding error just warn and pass.
-    """
-    with open(path, 'r') as fin:
-        try:
-            text = fin.read()
-            return text
-        except UnicodeDecodeError as e:
-            msg.warn(f"Skipping file {path}")
-            return None
-
-
-def _stream_texts(paths: List[Path]) -> Iterable[Union[str, None]]:
-    """
-    Yields strings or None when decoding error is encountered.
+    Yields strings from text files in paths.
     """
     for path in paths:
-        yield _maybe_read_text(path)
+        with open(path, 'r') as fin:
+            text = fin.read()
+            yield text
 
 
 @app.command("apply")
@@ -114,7 +104,7 @@ def apply(
     vocab = nlp.vocab
     docbin = DocBin()
     paths = walk_directory(data_path)
-    streams: List[Union[Iterable[str], Iterable[Doc]]] = []
+    streams: List[DocOrStrStream] = []
     text_files = []
     for path in paths:
         if path.suffix == ".spacy":
@@ -124,8 +114,8 @@ def apply(
         else:
             text_files.append(path)
     if len(text_files) > 0:
-        streams.append(filter(None, _stream_texts(text_files)))
-    datagen = cast(Iterable[Union[str, Doc]], chain(*streams))
+        streams.append(_stream_texts(text_files))
+    datagen = cast(DocOrStrStream, chain(*streams))
     for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
         docbin.add(doc)
     if output_path.is_dir():
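
Reviewer note (illustrative, not part of the patch): the commit drops the
warn-and-skip path so that decoding errors propagate instead of files being
silently skipped. A minimal sketch of the new behavior, assuming the
`_stream_texts` body from the diff above; the file name "bad.txt" and the
explicit encoding="utf-8" are demo assumptions (the patched code relies on
the platform default encoding):

    import tempfile
    from pathlib import Path
    from typing import Iterable

    def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
        # New behavior: read each file eagerly and let any
        # UnicodeDecodeError propagate to the caller.
        for path in paths:
            with open(path, "r", encoding="utf-8") as fin:
                yield fin.read()

    with tempfile.TemporaryDirectory() as tmp:
        bad = Path(tmp) / "bad.txt"
        bad.write_bytes(b"\xff\xfe not valid utf-8")
        try:
            list(_stream_texts([bad]))  # generator is lazy; list() forces the read
        except UnicodeDecodeError as err:
            # Before this patch: msg.warn(f"Skipping file {path}") and a
            # silent None; now the error surfaces to `spacy apply`.
            print(f"raised as expected: {err.reason}")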