diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a909a4241..12a3d2698 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -23,15 +23,16 @@ CONVERTERS = { } # File types -FILE_TYPES = ("json", "jsonl") +FILE_TYPES = ("json", "jsonl", "msg") +FILE_TYPES_STDOUT = ("json", "jsonl") @plac.annotations( input_file=("Input file", "positional", None, str), - output_dir=("Output directory for converted file", "positional", None, str), - file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str), + output_dir=("Output directory. '-' for stdout.", "positional", None, str), + file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str), n_sents=("Number of sentences per doc", "option", "n", int), - converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str), + converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str), lang=("Language (if tokenizer required)", "option", "l", str), morphology=("Enable appending morphology to tags", "flag", "m", bool), ) @@ -58,6 +59,13 @@ def convert( "Supported file types: '{}'".format(", ".join(FILE_TYPES)), exits=1, ) + if file_type not in FILE_TYPES_STDOUT and output_dir == "-": + # TODO: support msgpack via stdout in srsly? + msg.fail( + "Can't write .{} data to stdout.".format(file_type), + "Please specify an output directory.", + exits=1, + ) if not input_path.exists(): msg.fail("Input file not found", input_path, exits=1) if output_dir != "-" and not Path(output_dir).exists(): @@ -78,6 +86,8 @@ def convert( srsly.write_json(output_file, data) elif file_type == "jsonl": srsly.write_jsonl(output_file, data) + elif file_type == "msg": + srsly.write_msgpack(output_file, data) msg.good("Generated output file ({} documents)".format(len(data)), output_file) else: # Print to stdout diff --git a/spacy/errors.py b/spacy/errors.py index 13382d146..c409e5a0c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -342,7 +342,7 @@ class Errors(object): "equal to span length ({span_len}).") E122 = ("Cannot find token to be split. Did it get merged?") E123 = ("Cannot find head of token to be split. Did it get merged?") - E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg") + E124 = ("Cannot read from file: {path}. Supported formats: {formats}") E125 = ("Unexpected value: {value}") E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. " "This is likely a bug in spaCy, so feel free to open an issue.") diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d03d13d2d..02306c651 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -153,10 +153,13 @@ class GoldCorpus(object): loc = util.ensure_path(loc) if loc.parts[-1].endswith("json"): gold_tuples = read_json_file(loc) + elif loc.parts[-1].endswith("jsonl"): + gold_tuples = srsly.read_jsonl(loc) elif loc.parts[-1].endswith("msg"): gold_tuples = srsly.read_msgpack(loc) else: - raise ValueError(Errors.E124.format(path=path2str(loc))) + supported = ("json", "jsonl", "msg") + raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) for item in gold_tuples: yield item i += len(item[1]) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index ee4c3787b..2d3c13e37 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file extension of the input file. ```bash -$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents] -[--morphology] +$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] +[--n-sents] [--morphology] [--lang] ``` -| Argument | Type | Description | -| -------------------------------------------- | ---------- | ---------------------------------------------------------- | -| `input_file` | positional | Input file. | -| `output_dir` | positional | Output directory for converted JSON file. | -| `converter`, `-c` 2 | option | Name of converter to use (see below). | -| `--n-sents`, `-n` | option | Number of sentences per document. | -| `--morphology`, `-m` | option | Enable appending morphology to tags. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). | +| Argument | Type | Description | +| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- | +| `input_file` | positional | Input file. | +| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | +| `--file-type`, `-t` 2.1 | option | Type of file to create (see below). | +| `--converter`, `-c` 2 | option | Name of converter to use (see below). | +| `--n-sents`, `-n` | option | Number of sentences per document. | +| `--morphology`, `-m` | option | Enable appending morphology to tags. | +| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). | -The following file format converters are available: +### Output file types {new="2.1"} -| ID | Description | -| ----------------- | --------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension (default). | -| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | Tab-based named entity recognition format. | -| `iob` | IOB or IOB2 named entity recognition format. | +> #### Which format should I choose? +> +> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means +> that there's one JSON object per line. Unlike a regular JSON file, it can also +> be read in line-by-line and you won't have to parse the _entire file_ first. +> This makes it a very convenient format for larger corpora. + +All output files generated by this command are compatible with +[`spacy train`](/api/cli#train). + +| ID | Description | +| ------- | --------------------------------- | +| `jsonl` | Newline-delimited JSON (default). | +| `json` | Regular JSON. | +| `msg` | Binary MessagePack format. | + +### Converter options + + + +| ID | Description | +| ------------------------------ | --------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension (default). | +| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | +| `ner` | Tab-based named entity recognition format. | +| `iob` | IOB or IOB2 named entity recognition format. | ## Train {#train}