From fa8e097c04e4224a3fd72f03c2922f5844e9703a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 3 Jul 2020 15:42:04 +0200 Subject: [PATCH] Update convert docs [ci skip] --- website/docs/api/cli.md | 55 +++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index b26e034ce..c90d7c69c 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -105,40 +105,29 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] [--n-sents] [--morphology] [--lang] ``` -| Argument | Type | Description | -| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- | -| `input_file` | positional | Input file. | -| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | -| `--file-type`, `-t` 2.1 | option | Type of file to create (see below). | -| `--converter`, `-c` 2 | option | Name of converter to use (see below). | -| `--n-sents`, `-n` | option | Number of sentences per document. | -| `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | -| `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | -| `--morphology`, `-m` | option | Enable appending morphology to tags. | -| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). | +| Argument | Type | Description | +| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ | +| `input_file` | positional | Input file. | +| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | +| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | +| `--converter`, `-c` 2 | option | Name of converter to use (see below). | +| `--n-sents`, `-n` | option | Number of sentences per document. | +| `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | +| `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | +| `--morphology`, `-m` | option | Enable appending morphology to tags. | +| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | -### Output file types {new="2.1"} +### Converters -All output files generated by this command are compatible with -[`spacy train`](/api/cli#train). - -| ID | Description | -| ------- | -------------------------- | -| `json` | Regular JSON (default). | -| `jsonl` | Newline-delimited JSON. | -| `msg` | Binary MessagePack format. | - -### Converter options - -| ID | Description | -| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). | +| ID | Description | +| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/goldparse#docs_to_json). | +| `conll` | Universal Dependencies `.conllu` or `.conll` format. | +| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | ## Debug data {#debug-data new="2.2"} @@ -160,7 +149,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi | `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | | `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | | `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |