From dbd829f0ed2dba3eb6eb5b59b18396ed38e326b9 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 4 Jan 2023 12:51:40 +0900 Subject: [PATCH 1/3] Fix inconsistency in displaCy docs about page option (#12047) * Fix inconsistency in displaCy docs about page option The `page` option, which wraps the output SVG in HTML, is true by default for `serve` but not for `render`. The `render` docs were wrong though, so this updates them. * Update the same statement in more docs A few renderers used the same language --- spacy/displacy/__init__.py | 2 +- spacy/displacy/render.py | 4 ++-- website/docs/api/top-level.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index bc32001d7..2f2058b8e 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -36,7 +36,7 @@ def render( jupyter (bool): Override Jupyter auto-detection. options (dict): Visualiser-specific options, e.g. colors. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. - RETURNS (str): Rendered HTML markup. + RETURNS (str): Rendered SVG or HTML markup. DOCS: https://spacy.io/api/top-level#displacy.render USAGE: https://spacy.io/usage/visualizers diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 50dc3466c..f74222dc2 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -94,7 +94,7 @@ class SpanRenderer: parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (str): Rendered HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ rendered = [] for i, p in enumerate(parsed): @@ -510,7 +510,7 @@ class EntityRenderer: parsed (list): Dependency parses to render. page (bool): Render parses wrapped as full HTML page. minify (bool): Minify HTML markup. - RETURNS (str): Rendered HTML markup. + RETURNS (str): Rendered SVG or HTML markup. """ rendered = [] for i, p in enumerate(parsed): diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 883c5e3b9..6a63e07da 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -266,7 +266,7 @@ Render a dependency parse tree or named entity visualization. | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ | | `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ | -| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | +| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | From 7f6c638c3acd732c0b52a45a2b3ad0388cd1ae66 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 5 Jan 2023 10:21:00 +0100 Subject: [PATCH 2/3] fix processing of "auto" in convert (#12050) * fix processing of "auto" in walk_directory * add check for None * move AUTO check to convert and fix verification of args * add specific CLI test with CliRunner * cleanup * more cleanup * update docstring --- spacy/cli/_util.py | 4 ++++ spacy/cli/convert.py | 26 ++++++++++++++++---------- spacy/tests/test_cli.py | 26 +++++++++++++++++++++++++- spacy/tests/test_cli_app.py | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 11 deletions(-) create mode 100644 spacy/tests/test_cli_app.py diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c46abffe5..0f4e9f599 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None: def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]: + """Given a directory and a suffix, recursively find all files matching the suffix. + Directories or files with names beginning with a . are ignored, but hidden flags on + filesystems are not checked. + When provided with a suffix `None`, there is no suffix-based filtering.""" if not path.is_dir(): return [path] paths = [path] diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 7f365ae2c..68d454b3e 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = { "json": json_to_docs, } +AUTO = "auto" + # File types that can be written to stdout FILE_TYPES_STDOUT = ("json",) @@ -49,7 +51,7 @@ def convert_cli( model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"), morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"), merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"), - converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), + converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"), ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True), lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"), concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"), @@ -70,8 +72,8 @@ def convert_cli( output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir silent = output_dir == "-" msg = Printer(no_print=silent) - verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map) converter = _get_converter(msg, converter, input_path) + verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map) convert( input_path, output_dir, @@ -100,7 +102,7 @@ def convert( model: Optional[str] = None, morphology: bool = False, merge_subtokens: bool = False, - converter: str = "auto", + converter: str, ner_map: Optional[Path] = None, lang: Optional[str] = None, concatenate: bool = False, @@ -212,18 +214,22 @@ def verify_cli_args( input_locs = walk_directory(input_path, converter) if len(input_locs) == 0: msg.fail("No input files in directory", input_path, exits=1) - file_types = list(set([loc.suffix[1:] for loc in input_locs])) - if converter == "auto" and len(file_types) >= 2: - file_types_str = ",".join(file_types) - msg.fail("All input files must be same type", file_types_str, exits=1) - if converter != "auto" and converter not in CONVERTERS: + if converter not in CONVERTERS: msg.fail(f"Can't find converter for {converter}", exits=1) def _get_converter(msg, converter, input_path: Path): if input_path.is_dir(): - input_path = walk_directory(input_path, converter)[0] - if converter == "auto": + if converter == AUTO: + input_locs = walk_directory(input_path, suffix=None) + file_types = list(set([loc.suffix[1:] for loc in input_locs])) + if len(file_types) >= 2: + file_types_str = ",".join(file_types) + msg.fail("All input files must be same type", file_types_str, exits=1) + input_path = input_locs[0] + else: + input_path = walk_directory(input_path, suffix=converter)[0] + if converter == AUTO: converter = input_path.suffix[1:] if converter == "ner" or converter == "iob": with input_path.open(encoding="utf8") as file_: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c6768a3fd..c88e20de2 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -4,6 +4,7 @@ from collections import Counter from typing import Tuple, List, Dict, Any import pkg_resources import time +from pathlib import Path import spacy import numpy @@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError from spacy import about from spacy.cli import info -from spacy.cli._util import is_subpath_of, load_project_config +from spacy.cli._util import is_subpath_of, load_project_config, walk_directory from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands @@ -1185,3 +1186,26 @@ def test_upload_download_local_file(): download_file(remote_file, local_file) with local_file.open(mode="r") as file_: assert file_.read() == content + + +def test_walk_directory(): + with make_tempdir() as d: + files = [ + "data1.iob", + "data2.iob", + "data3.json", + "data4.conll", + "data5.conll", + "data6.conll", + "data7.txt", + ] + + for f in files: + Path(d / f).touch() + + assert (len(walk_directory(d))) == 7 + assert (len(walk_directory(d, suffix=None))) == 7 + assert (len(walk_directory(d, suffix="json"))) == 1 + assert (len(walk_directory(d, suffix="iob"))) == 2 + assert (len(walk_directory(d, suffix="conll"))) == 3 + assert (len(walk_directory(d, suffix="pdf"))) == 0 diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py new file mode 100644 index 000000000..873a3ff66 --- /dev/null +++ b/spacy/tests/test_cli_app.py @@ -0,0 +1,33 @@ +import os +from pathlib import Path +from typer.testing import CliRunner + +from spacy.cli._util import app +from .util import make_tempdir + + +def test_convert_auto(): + with make_tempdir() as d_in, make_tempdir() as d_out: + for f in ["data1.iob", "data2.iob", "data3.iob"]: + Path(d_in / f).touch() + + # ensure that "automatic" suffix detection works + result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)]) + assert "Generated output file" in result.stdout + out_files = os.listdir(d_out) + assert len(out_files) == 3 + assert "data1.spacy" in out_files + assert "data2.spacy" in out_files + assert "data3.spacy" in out_files + + +def test_convert_auto_conflict(): + with make_tempdir() as d_in, make_tempdir() as d_out: + for f in ["data1.iob", "data2.iob", "data3.json"]: + Path(d_in / f).touch() + + # ensure that "automatic" suffix detection warns when there are different file types + result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)]) + assert "All input files must be same type" in result.stdout + out_files = os.listdir(d_out) + assert len(out_files) == 0 From f1dcdefc8abb21345680b79e9d538f06cf62bca0 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Thu, 5 Jan 2023 11:46:04 +0100 Subject: [PATCH 3/3] Add version tag to `before_update` config key (#12059) --- website/docs/api/data-formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 768844cf3..420e827a0 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train). | `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | | `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ | +| `before_update` 3.5 | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ | | `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | | `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | | `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |