diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 56f38766a..af6ef147c 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -28,7 +28,7 @@ CONVERTERS = { # File types that can be written to stdout -FILE_TYPES_STDOUT = ("json") +FILE_TYPES_STDOUT = ("json",) class FileTypes(str, Enum): @@ -86,20 +86,20 @@ def convert_cli( def convert( - input_path: Path, - output_dir: Path, - *, - file_type: str = "json", - n_sents: int = 1, - seg_sents: bool = False, - model: Optional[str] = None, - morphology: bool = False, - merge_subtokens: bool = False, - converter: str = "auto", - ner_map: Optional[Path] = None, - lang: Optional[str] = None, - silent: bool = True, - msg: Optional[Path] = None, + input_path: Path, + output_dir: Path, + *, + file_type: str = "json", + n_sents: int = 1, + seg_sents: bool = False, + model: Optional[str] = None, + morphology: bool = False, + merge_subtokens: bool = False, + converter: str = "auto", + ner_map: Optional[Path] = None, + lang: Optional[str] = None, + silent: bool = True, + msg: Optional[Path] = None, ) -> None: if not msg: msg = Printer(no_print=silent) @@ -149,7 +149,7 @@ def _write_docs_to_file(docs, output_file, output_type): data = DocBin(docs=docs, store_user_data=True).to_bytes() with output_file.open("wb") as file_: file_.write(data) - + def autodetect_ner_format(input_data: str) -> str: # guess format from the first 20 lines diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index b26e034ce..c90d7c69c 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -105,40 +105,29 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] [--n-sents] [--morphology] [--lang] ``` -| Argument | Type | Description | -| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- | -| `input_file` | positional | Input file. 
| -| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | -| `--file-type`, `-t` 2.1 | option | Type of file to create (see below). | -| `--converter`, `-c` 2 | option | Name of converter to use (see below). | -| `--n-sents`, `-n` | option | Number of sentences per document. | -| `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | -| `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | -| `--morphology`, `-m` | option | Enable appending morphology to tags. | -| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). | +| Argument | Type | Description | +| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ | +| `input_file` | positional | Input file. | +| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | +| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | +| `--converter`, `-c` 2 | option | Name of converter to use (see below). | +| `--n-sents`, `-n` | option | Number of sentences per document. | +| `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | +| `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | +| `--morphology`, `-m` | flag | Enable appending morphology to tags. | +| `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | +| `--help`, `-h` | flag | Show help message and available arguments. 
| +| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | -### Output file types {new="2.1"} +### Converters -All output files generated by this command are compatible with -[`spacy train`](/api/cli#train). - -| ID | Description | -| ------- | -------------------------- | -| `json` | Regular JSON (default). | -| `jsonl` | Newline-delimited JSON. | -| `msg` | Binary MessagePack format. | - -### Converter options - -| ID | Description | -| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `auto` | Automatically pick converter based on file extension and file content (default). | -| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | -| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | -| `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. 
See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). | +| ID | Description | +| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `auto` | Automatically pick converter based on file extension and file content (default). | +| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/goldparse#docs_to_json). | +| `conll` | Universal Dependencies `.conllu` or `.conll` format. | +| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | +| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | ## Debug data {#debug-data new="2.2"} @@ -160,7 +149,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi | `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | | `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | | `--verbose`, `-V` | flag | Print additional information and explanations. | -| --no-format, `-NF` | flag | Don't pretty-print the results. 
Use this if you want to write to a file. | +| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | diff --git a/website/docs/api/example.md b/website/docs/api/example.md new file mode 100644 index 000000000..9dabaf851 --- /dev/null +++ b/website/docs/api/example.md @@ -0,0 +1,10 @@ +--- +title: Example +teaser: A training example +tag: class +source: spacy/gold/example.pyx +--- + + + +## Example.\_\_init\_\_ {#init tag="method"} diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 015051f95..165c02a29 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -67,7 +67,8 @@ { "text": "Doc", "url": "/api/doc" }, { "text": "Token", "url": "/api/token" }, { "text": "Span", "url": "/api/span" }, - { "text": "Lexeme", "url": "/api/lexeme" } + { "text": "Lexeme", "url": "/api/lexeme" }, + { "text": "Example", "url": "/api/example" } ] }, {