diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c46abffe5..0f4e9f599 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
+ """Given a directory and a suffix, recursively find all files matching the suffix.
+ Directories or files with names beginning with a . are ignored, but hidden flags on
+ filesystems are not checked.
+ When provided with a suffix `None`, there is no suffix-based filtering."""
if not path.is_dir():
return [path]
paths = [path]
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 7f365ae2c..68d454b3e 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs,
}
+AUTO = "auto"
+
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
@@ -49,7 +51,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
- converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
+ converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@@ -70,8 +72,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
- verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
+ verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert(
input_path,
output_dir,
@@ -100,7 +102,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
- converter: str = "auto",
+ converter: str,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@@ -212,18 +214,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
- file_types = list(set([loc.suffix[1:] for loc in input_locs]))
- if converter == "auto" and len(file_types) >= 2:
- file_types_str = ",".join(file_types)
- msg.fail("All input files must be same type", file_types_str, exits=1)
- if converter != "auto" and converter not in CONVERTERS:
+ if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
- input_path = walk_directory(input_path, converter)[0]
- if converter == "auto":
+ if converter == AUTO:
+ input_locs = walk_directory(input_path, suffix=None)
+ file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+ if len(file_types) >= 2:
+ file_types_str = ",".join(file_types)
+ msg.fail("All input files must be same type", file_types_str, exits=1)
+ input_path = input_locs[0]
+ else:
+ input_path = walk_directory(input_path, suffix=converter)[0]
+ if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index bc32001d7..2f2058b8e 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -36,7 +36,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
- RETURNS (str): Rendered HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 50dc3466c..f74222dc2 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -94,7 +94,7 @@ class SpanRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (str): Rendered HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@@ -510,7 +510,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (str): Rendered HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index c6768a3fd..c88e20de2 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -4,6 +4,7 @@ from collections import Counter
from typing import Tuple, List, Dict, Any
import pkg_resources
import time
+from pathlib import Path
import spacy
import numpy
@@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
from spacy import about
from spacy.cli import info
-from spacy.cli._util import is_subpath_of, load_project_config
+from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
@@ -1185,3 +1186,26 @@ def test_upload_download_local_file():
download_file(remote_file, local_file)
with local_file.open(mode="r") as file_:
assert file_.read() == content
+
+
+def test_walk_directory():
+ with make_tempdir() as d:
+ files = [
+ "data1.iob",
+ "data2.iob",
+ "data3.json",
+ "data4.conll",
+ "data5.conll",
+ "data6.conll",
+ "data7.txt",
+ ]
+
+ for f in files:
+ Path(d / f).touch()
+
+ assert (len(walk_directory(d))) == 7
+ assert (len(walk_directory(d, suffix=None))) == 7
+ assert (len(walk_directory(d, suffix="json"))) == 1
+ assert (len(walk_directory(d, suffix="iob"))) == 2
+ assert (len(walk_directory(d, suffix="conll"))) == 3
+ assert (len(walk_directory(d, suffix="pdf"))) == 0
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
new file mode 100644
index 000000000..873a3ff66
--- /dev/null
+++ b/spacy/tests/test_cli_app.py
@@ -0,0 +1,33 @@
+import os
+from pathlib import Path
+from typer.testing import CliRunner
+
+from spacy.cli._util import app
+from .util import make_tempdir
+
+
+def test_convert_auto():
+ with make_tempdir() as d_in, make_tempdir() as d_out:
+ for f in ["data1.iob", "data2.iob", "data3.iob"]:
+ Path(d_in / f).touch()
+
+ # ensure that "automatic" suffix detection works
+ result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
+ assert "Generated output file" in result.stdout
+ out_files = os.listdir(d_out)
+ assert len(out_files) == 3
+ assert "data1.spacy" in out_files
+ assert "data2.spacy" in out_files
+ assert "data3.spacy" in out_files
+
+
+def test_convert_auto_conflict():
+ with make_tempdir() as d_in, make_tempdir() as d_out:
+ for f in ["data1.iob", "data2.iob", "data3.json"]:
+ Path(d_in / f).touch()
+
+ # ensure that "automatic" suffix detection warns when there are different file types
+ result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
+ assert "All input files must be same type" in result.stdout
+ out_files = os.listdir(d_out)
+ assert len(out_files) == 0
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 768844cf3..420e827a0 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
+| `before_update` 3.5 | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 883c5e3b9..6a63e07da 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -266,7 +266,7 @@ Render a dependency parse tree or named entity visualization.
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ |
-| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
+| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |