diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index 31fff3616..8988d5add 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -47,13 +47,13 @@ def _stream_jsonl(path: Path, field) -> Iterable[str]:
     not found it raises error.
     """
     for entry in srsly.read_jsonl(path):
         if field not in entry:
             raise msg.fail(
                 f"{path} does not contain the required '{field}' field.", exits=1
             )
         else:
-            yield entry["text"]
+            yield entry[field]
 
 
 def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
@@ -73,7 +73,7 @@ def apply_cli(
     data_path: Path = Arg(..., help=path_help, exists=True),
     output_file: Path = Arg(..., help=out_help, dir_okay=False),
     code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
-    field: str = Opt("text", "--field", "-f", help="Field to grab from .jsonl"),
+    json_field: str = Opt("text", "--field", "-f", help="Field to grab from .jsonl"),
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
     batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
     n_process: int = Opt(1, "--n-process", "-n", help="number of processors to use.")
@@ -82,7 +82,7 @@ def apply_cli(
     Apply a trained pipeline to documents to get predictions. Expects a loadable
     spaCy pipeline and path to the data, which can be a directory or a file.
-    The data files can be provided multiple formats:
+    The data files can be provided in multiple formats:
     1. .spacy files
     2. .jsonl files with a specified "field" to read the text from.
     3. Files with any other extension are assumed to be containing
        a single document.
@@ -91,18 +91,19 @@ def apply_cli(
     """
     import_code(code_path)
     setup_gpu(use_gpu)
-    apply(data_path, output, model, batch_size, n_process)
+    apply(data_path, output_file, model, json_field, batch_size, n_process)
 
 
 def apply(
     data_path: Path,
-    output: Path,
+    output_file: Path,
     model: str,
+    json_field: str,
     batch_size: int,
     n_process: int,
 ):
     data_path = ensure_path(data_path)
-    output_path = ensure_path(output)
+    output_file = ensure_path(output_file)
     if not data_path.exists():
         msg.fail("Couldn't find data path.", data_path, exits=1)
     nlp = load_model(model)
@@ -116,7 +117,7 @@ def apply(
         if path.suffix == ".spacy":
             streams.append(_stream_docbin(path, vocab))
         elif path.suffix == ".jsonl":
-            streams.append(_stream_jsonl(path, field))
+            streams.append(_stream_jsonl(path, json_field))
         else:
             text_files.append(path)
     if len(text_files) > 0:
@@ -124,6 +125,6 @@ def apply(
     datagen = cast(DocOrStrStream, chain(*streams))
     for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
         docbin.add(doc)
-    if not output_file.endswith(".spacy"):
-        output_file += ".spacy"
+    if output_file.suffix == "":
+        output_file = output_file.with_suffix(".spacy")
     docbin.to_disk(output_file)
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index e1b3fcdb2..964f03a79 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -864,7 +864,7 @@ def test_span_length_freq_dist_output_must_be_correct():
 def test_applycli_empty_dir():
     with make_tempdir() as data_path:
         output = data_path / "test.spacy"
-        apply(data_path, output, "blank:en", 1, 1)
+        apply(data_path, output, "blank:en", "text", 1, 1)
 
 
 def test_applycli_docbin():
@@ -875,35 +875,29 @@ def test_applycli_docbin():
         # test empty DocBin case
         docbin = DocBin()
         docbin.to_disk(data_path / "testin.spacy")
-        apply(data_path, output, "blank:en", 1, 1)
+        apply(data_path, output, "blank:en", "text", 1, 1)
         docbin.add(doc)
         docbin.to_disk(data_path / "testin.spacy")
-        apply(data_path, output, "blank:en", 1, 1)
+        apply(data_path, output, "blank:en", "text", 1, 1)
 
 
 def test_applycli_jsonl():
     with make_tempdir() as data_path:
         output = data_path / "testout.spacy"
-        data = [{"text": "Testing apply cli.", "key": 234}]
+        data = [{"field": "Testing apply cli.", "key": 234}]
         srsly.write_jsonl(data_path / "test.jsonl", data)
-        apply(data_path, output, "blank:en", 1, 1)
-        data = [{"key": 234}]
+        apply(data_path, output, "blank:en", "field", 1, 1)
+        data = [{"field": "234"}]
         srsly.write_jsonl(data_path / "test2.jsonl", data)
-        # test no "text" field case
-        with pytest.raises(ValueError, match="test2.jsonl"):
-            apply(data_path, output, "blank:en", 1, 1)
+        apply(data_path, output, "blank:en", "field", 1, 1)
 
 
 def test_applycli_txt():
     with make_tempdir() as data_path:
         output = data_path / "testout.spacy"
-        data = [{"text": "Testing apply cli.", "key": 234}]
-        srsly.write_jsonl(data_path / "test.jsonl", data)
-        apply(data_path, output, "blank:en", 1, 1)
-        data = [{"key": 234}]
-        srsly.write_jsonl(data_path / "test2.jsonl", data)
-        with pytest.raises(ValueError, match="test2.jsonl"):
-            apply(data_path, output, "blank:en", 1, 1)
+        with open(data_path / "test.foo", "w") as ftest:
+            ftest.write("Testing apply cli.")
+        apply(data_path, output, "blank:en", "text", 1, 1)
 
 
 def test_applycli_mixed():
@@ -919,7 +913,7 @@ def test_applycli_mixed():
         docbin.to_disk(data_path / "testin.spacy")
         with open(data_path / "test.txt", "w") as ftest:
             ftest.write(text)
-        apply(data_path, output, "blank:en", 1, 1)
+        apply(data_path, output, "blank:en", "text", 1, 1)
         # Check whether it worked
         result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
         assert len(result) == 3
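
Usage note: the sketch below shows how the patched apply entry point could be exercised once this change is applied. The directory name, the JSONL file name, and the "content" key are illustrative only; the keyword arguments follow the new apply(data_path, output_file, model, json_field, batch_size, n_process) signature from the diff, "blank:en" is used as the model the same way the tests above do, and the import path assumes apply lives in spacy.cli.apply as shown here.

from pathlib import Path

import srsly
from spacy.cli.apply import apply  # signature extended by this diff

# Illustrative input: a JSONL file whose text lives under a custom key.
data_dir = Path("corpus")
data_dir.mkdir(exist_ok=True)
srsly.write_jsonl(data_dir / "docs.jsonl", [{"content": "spaCy makes NLP easy."}])

# The JSONL field to read is now passed explicitly instead of being
# hard-coded to "text"; the predictions are written to a DocBin on disk.
apply(
    data_path=data_dir,
    output_file=data_dir / "docs_out.spacy",
    model="blank:en",
    json_field="content",
    batch_size=1,
    n_process=1,
)

From the shell, the equivalent run would presumably be "python -m spacy apply blank:en corpus corpus/docs_out.spacy --field content", assuming the usual MODEL DATA_PATH OUTPUT_FILE argument order; the --field/-f option maps onto json_field.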