diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index a909a4241..12a3d2698 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -23,15 +23,16 @@ CONVERTERS = {
}
# File types
-FILE_TYPES = ("json", "jsonl")
+FILE_TYPES = ("json", "jsonl", "msg")
+FILE_TYPES_STDOUT = ("json", "jsonl")
@plac.annotations(
input_file=("Input file", "positional", None, str),
- output_dir=("Output directory for converted file", "positional", None, str),
- file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
+ output_dir=("Output directory. '-' for stdout.", "positional", None, str),
+ file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
n_sents=("Number of sentences per doc", "option", "n", int),
- converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+ converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
lang=("Language (if tokenizer required)", "option", "l", str),
morphology=("Enable appending morphology to tags", "flag", "m", bool),
)
@@ -58,6 +59,13 @@ def convert(
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
exits=1,
)
+ if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+ # TODO: support msgpack via stdout in srsly?
+ msg.fail(
+ "Can't write .{} data to stdout.".format(file_type),
+ "Please specify an output directory.",
+ exits=1,
+ )
if not input_path.exists():
msg.fail("Input file not found", input_path, exits=1)
if output_dir != "-" and not Path(output_dir).exists():
@@ -78,6 +86,8 @@ def convert(
srsly.write_json(output_file, data)
elif file_type == "jsonl":
srsly.write_jsonl(output_file, data)
+ elif file_type == "msg":
+ srsly.write_msgpack(output_file, data)
msg.good("Generated output file ({} documents)".format(len(data)), output_file)
else:
# Print to stdout
diff --git a/spacy/errors.py b/spacy/errors.py
index 13382d146..c409e5a0c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -342,7 +342,7 @@ class Errors(object):
"equal to span length ({span_len}).")
E122 = ("Cannot find token to be split. Did it get merged?")
E123 = ("Cannot find head of token to be split. Did it get merged?")
- E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
+ E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
E125 = ("Unexpected value: {value}")
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
"This is likely a bug in spaCy, so feel free to open an issue.")
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index d03d13d2d..02306c651 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -153,10 +153,13 @@ class GoldCorpus(object):
loc = util.ensure_path(loc)
if loc.parts[-1].endswith("json"):
gold_tuples = read_json_file(loc)
+ elif loc.parts[-1].endswith("jsonl"):
+ gold_tuples = srsly.read_jsonl(loc)
elif loc.parts[-1].endswith("msg"):
gold_tuples = srsly.read_msgpack(loc)
else:
- raise ValueError(Errors.E124.format(path=path2str(loc)))
+ supported = ("json", "jsonl", "msg")
+ raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
for item in gold_tuples:
yield item
i += len(item[1])
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index ee4c3787b..2d3c13e37 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
extension of the input file.
```bash
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
-[--morphology]
+$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
+[--n-sents] [--morphology] [--lang]
```
-| Argument | Type | Description |
-| -------------------------------------------- | ---------- | ---------------------------------------------------------- |
-| `input_file` | positional | Input file. |
-| `output_dir` | positional | Output directory for converted JSON file. |
-| `converter`, `-c` 2 | option | Name of converter to use (see below). |
-| `--n-sents`, `-n` | option | Number of sentences per document. |
-| `--morphology`, `-m` | option | Enable appending morphology to tags. |
-| `--help`, `-h` | flag | Show help message and available arguments. |
-| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
+| Argument | Type | Description |
+| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- |
+| `input_file` | positional | Input file. |
+| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
+| `--file-type`, `-t` 2.1 | option | Type of file to create (see below). |
+| `--converter`, `-c` 2 | option | Name of converter to use (see below). |
+| `--n-sents`, `-n` | option | Number of sentences per document. |
+| `--morphology`, `-m` | option | Enable appending morphology to tags. |
+| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). |
+| `--help`, `-h` | flag | Show help message and available arguments. |
+| **CREATES**                                      | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input), serialized as JSONL, JSON or msgpack.  |
-The following file format converters are available:
+### Output file types {new="2.1"}
-| ID | Description |
-| ----------------- | --------------------------------------------------------------- |
-| `auto` | Automatically pick converter based on file extension (default). |
-| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. |
-| `ner` | Tab-based named entity recognition format. |
-| `iob` | IOB or IOB2 named entity recognition format. |
+> #### Which format should I choose?
+>
+> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
+> that there's one JSON object per line. Unlike a regular JSON file, it can be
+> read line by line, so you don't have to parse the _entire file_ first. This
+> makes it a very convenient format for larger corpora.
+
+All output files generated by this command are compatible with
+[`spacy train`](/api/cli#train).
+
+| ID | Description |
+| ------- | --------------------------------- |
+| `jsonl` | Newline-delimited JSON (default). |
+| `json` | Regular JSON. |
+| `msg` | Binary MessagePack format. |
+
+### Converter options
+
+
+
+| ID | Description |
+| ------------------------------ | --------------------------------------------------------------- |
+| `auto` | Automatically pick converter based on file extension (default). |
+| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
+| `ner` | Tab-based named entity recognition format. |
+| `iob` | IOB or IOB2 named entity recognition format. |
## Train {#train}