Merge branch 'develop' into nightly.spacy.io

2025-12-22 09:34:23 +03:00 · 2020-07-03 16:48:27 +02:00 · 2020-07-03 16:48:27 +02:00 · fe224dc2dd
commit fe224dc2dd
parent 06f1ecb308 cdf9ee1716
4 changed files with 50 additions and 50 deletions
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -28,7 +28,7 @@ CONVERTERS = {


 # File types that can be written to stdout
-FILE_TYPES_STDOUT = ("json")
+FILE_TYPES_STDOUT = ("json",)


 class FileTypes(str, Enum):
@ -86,20 +86,20 @@ def convert_cli(


 def convert(
-        input_path: Path,
-        output_dir: Path,
-        *,
-        file_type: str = "json",
-        n_sents: int = 1,
-        seg_sents: bool = False,
-        model: Optional[str] = None,
-        morphology: bool = False,
-        merge_subtokens: bool = False,
-        converter: str = "auto",
-        ner_map: Optional[Path] = None,
-        lang: Optional[str] = None,
-        silent: bool = True,
-        msg: Optional[Path] = None,
+    input_path: Path,
+    output_dir: Path,
+    *,
+    file_type: str = "json",
+    n_sents: int = 1,
+    seg_sents: bool = False,
+    model: Optional[str] = None,
+    morphology: bool = False,
+    merge_subtokens: bool = False,
+    converter: str = "auto",
+    ner_map: Optional[Path] = None,
+    lang: Optional[str] = None,
+    silent: bool = True,
+    msg: Optional[Path] = None,
 ) -> None:
    if not msg:
        msg = Printer(no_print=silent)
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -105,40 +105,29 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
 [--n-sents] [--morphology] [--lang]
 ```

-| Argument                                         | Type       | Description                                                                                       |
-| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- |
-| `input_file`                                     | positional | Input file.                                                                                       |
-| `output_dir`                                     | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
-| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create (see below).                                                               |
-| `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                             |
-| `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                 |
-| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag       | Segment sentences (for `-c ner`)                                                                  |
-| `--model`, `-b` <Tag variant="new">2.2</Tag>     | option     | Model for parser-based sentence segmentation (for `-s`)                                           |
-| `--morphology`, `-m`                             | option     | Enable appending morphology to tags.                                                              |
-| `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                            |
-| `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                        |
-| **CREATES**                                      | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input).                                        |
+| Argument                                         | Type       | Description                                                                                                              |
+| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `input_file`                                     | positional | Input file.                                                                                                              |
+| `output_dir`                                     | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`.                        |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. |
+| `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                                                    |
+| `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                                        |
+| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag       | Segment sentences (for `-c ner`).                                                                                        |
+| `--model`, `-b` <Tag variant="new">2.2</Tag>     | option     | Model for parser-based sentence segmentation (for `-s`).                                                                 |
+| `--morphology`, `-m`                             | flag       | Enable appending morphology to tags.                                                                                     |
+| `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                                                   |
+| `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                                               |
+| **CREATES**                                      | binary     | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train).                      |

-### Output file types {new="2.1"}
+### Converters

-All output files generated by this command are compatible with
-[`spacy train`](/api/cli#train).
-
-| ID      | Description                |
-| ------- | -------------------------- |
-| `json`  | Regular JSON (default).    |
-| `jsonl` | Newline-delimited JSON.    |
-| `msg`   | Binary MessagePack format. |
-
-### Converter options
-
-| ID                             | Description                                                                                                                                                                                                                                                                                                                                                                                    |
-| ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `auto`                         | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                                               |
-| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format.                                                                                                                                                                                                                                                                                                                                           |
-| `ner`                          | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-| `iob`                          | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data).                                                                                                                              |
-| `jsonl`                        | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl).                                                                                     |
+| ID      | Description                                                                                                                                                                                                                                                                                                                                                                                    |
+| ------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `auto`  | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                                               |
+| `json`  | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/goldparse#docs_to_json).                                                                                                                                                                                                                                                                                    |
+| `conll` | Universal Dependencies `.conllu` or `.conll` format.                                                                                                                                                                                                                                                                                                                                           |
+| `ner`   | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
+| `iob`   | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data).                                                                                                                              |

 ## Debug data {#debug-data new="2.2"}

@ -160,7 +149,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi
 | `--pipeline`, `-p`                                     | option     | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.          |
 | `--ignore-warnings`, `-IW`                             | flag       | Ignore warnings, only show stats and errors.                                                       |
 | `--verbose`, `-V`                                      | flag       | Print additional information and explanations.                                                     |
-| --no-format, `-NF`                                     | flag       | Don't pretty-print the results. Use this if you want to write to a file.                           |
+| `--no-format`, `-NF`                                   | flag       | Don't pretty-print the results. Use this if you want to write to a file.                           |

 <Accordion title="Example output">

--- a/website/docs/api/example.md
+++ b/website/docs/api/example.md
@ -0,0 +1,10 @@
+---
+title: Example
+teaser: A training example
+tag: class
+source: spacy/gold/example.pyx
+---
+
+<!-- TODO: -->
+
+## Example.\_\_init\_\_ {#init tag="method"}
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@ -67,7 +67,8 @@
                    { "text": "Doc", "url": "/api/doc" },
                    { "text": "Token", "url": "/api/token" },
                    { "text": "Span", "url": "/api/span" },
-                    { "text": "Lexeme", "url": "/api/lexeme" }
+                    { "text": "Lexeme", "url": "/api/lexeme" },
+                    { "text": "Example", "url": "/api/example" }
                ]
            },
            {