💫 Improve converters and training data file formats (#3374)

* Populate converter argument info automatically
* Add conversion option for msgpack
* Update docs
* Allow reading training data from JSONL
2025-10-25 05:01:02 +03:00 · 2019-03-08 23:15:23 +01:00 · 2019-03-08 23:15:23 +01:00 · 76764fcf59
commit 76764fcf59
parent 296446a1c8
4 changed files with 59 additions and 24 deletions
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -23,15 +23,16 @@ CONVERTERS = {
 }

 # File types
-FILE_TYPES = ("json", "jsonl")
+FILE_TYPES = ("json", "jsonl", "msg")
+FILE_TYPES_STDOUT = ("json", "jsonl")


@plac.annotations(
    input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory for converted file", "positional", None, str),
-    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
+    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
+    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
    n_sents=("Number of sentences per doc", "option", "n", int),
-    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
    lang=("Language (if tokenizer required)", "option", "l", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool),
 )
@@ -58,6 +59,13 @@ def convert(
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            "Can't write .{} data to stdout.".format(file_type),
+            "Please specify an output directory.",
+            exits=1,
+        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
@@ -78,6 +86,8 @@ def convert(
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
+        elif file_type == "msg":
+            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -342,7 +342,7 @@ class Errors(object):
            "equal to span length ({span_len}).")
    E122 = ("Cannot find token to be split. Did it get merged?")
    E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
+    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
    E125 = ("Unexpected value: {value}")
    E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
            "This is likely a bug in spaCy, so feel free to open an issue.")
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -153,10 +153,13 @@ class GoldCorpus(object):
            loc = util.ensure_path(loc)
            if loc.parts[-1].endswith("json"):
                gold_tuples = read_json_file(loc)
+            elif loc.parts[-1].endswith("jsonl"):
+                gold_tuples = srsly.read_jsonl(loc)
            elif loc.parts[-1].endswith("msg"):
                gold_tuples = srsly.read_msgpack(loc)
            else:
-                raise ValueError(Errors.E124.format(path=path2str(loc)))
+                supported = ("json", "jsonl", "msg")
+                raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
            for item in gold_tuples:
                yield item
                i += len(item[1])
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
 extension of the input file.

 ```bash
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
-[--morphology]
+$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
+[--n-sents] [--morphology] [--lang]
 ```

-| Argument                                     | Type       | Description                                                |
-| -------------------------------------------- | ---------- | ---------------------------------------------------------- |
-| `input_file`                                 | positional | Input file.                                                |
-| `output_dir`                                 | positional | Output directory for converted JSON file.                  |
-| `converter`, `-c` <Tag variant="new">2</Tag> | option     | Name of converter to use (see below).                      |
-| `--n-sents`, `-n`                            | option     | Number of sentences per document.                          |
-| `--morphology`, `-m`                         | option     | Enable appending morphology to tags.                       |
-| `--help`, `-h`                               | flag       | Show help message and available arguments.                 |
-| **CREATES**                                  | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input). |
+| Argument                                         | Type       | Description                                                                                       |
+| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- |
+| `input_file`                                     | positional | Input file.                                                                                       |
+| `output_dir`                                     | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create (see below).                                                               |
+| `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                             |
+| `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                 |
+| `--morphology`, `-m`                             | option     | Enable appending morphology to tags.                                                              |
+| `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                            |
+| `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                        |
+| **CREATES**                                      | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input).                                        |

-The following file format converters are available:
+### Output file types {new="2.1"}

-| ID                | Description                                                     |
-| ----------------- | --------------------------------------------------------------- |
-| `auto`            | Automatically pick converter based on file extension (default). |
-| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format.            |
-| `ner`             | Tab-based named entity recognition format.                      |
-| `iob`             | IOB or IOB2 named entity recognition format.                    |
+> #### Which format should I choose?
+>
+> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
+> that there's one JSON object per line. Unlike a regular JSON file, it can be
+> read line by line, so you don't have to parse the _entire file_ first. This
+> makes it a very convenient format for larger corpora.
+
+All output files generated by this command are compatible with
+[`spacy train`](/api/cli#train).
+
+| ID      | Description                       |
+| ------- | --------------------------------- |
+| `jsonl` | Newline-delimited JSON (default). |
+| `json`  | Regular JSON.                     |
+| `msg`   | Binary MessagePack format.        |
+
+### Converter options
+
+<!-- TODO: document jsonl option – maybe update it? -->
+
+| ID                             | Description                                                     |
+| ------------------------------ | --------------------------------------------------------------- |
+| `auto`                         | Automatically pick converter based on file extension (default). |
+| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format.            |
+| `ner`                          | Tab-based named entity recognition format.                      |
+| `iob`                          | IOB or IOB2 named entity recognition format.                    |

 ## Train {#train}