Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-07-03 16:48:27 +02:00
commit fe224dc2dd
4 changed files with 50 additions and 50 deletions

View File

@@ -28,7 +28,7 @@ CONVERTERS = {
# File types that can be written to stdout
-FILE_TYPES_STDOUT = ("json")
+FILE_TYPES_STDOUT = ("json",)
class FileTypes(str, Enum):
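The one-line fix above is easy to miss: `("json")` is just a parenthesized string, not a tuple, so membership checks against it do substring matching. The trailing comma in `("json",)` makes it a real one-element tuple. A minimal illustration (not from the repository):

```python
# Parentheses alone don't create a tuple; the trailing comma does.
not_a_tuple = ("json")   # just the string "json"
one_tuple = ("json",)    # a 1-element tuple

print("so" in not_a_tuple)   # True  -- substring match against the string
print("so" in one_tuple)     # False -- "so" is not an element of the tuple
print("json" in one_tuple)   # True
```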
@@ -86,20 +86,20 @@ def convert_cli(
def convert(
    input_path: Path,
    output_dir: Path,
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map: Optional[Path] = None,
    lang: Optional[str] = None,
    silent: bool = True,
    msg: Optional[Path] = None,
) -> None:
    if not msg:
        msg = Printer(no_print=silent)
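The keyword-only signature above can also be called from Python rather than through the CLI. A minimal sketch, assuming `convert` is re-exported from `spacy.cli` as in the released packages; the paths and converter choice are hypothetical:

```python
from pathlib import Path

from spacy.cli import convert  # assumed re-export of the function shown above

# Hypothetical input/output paths; any documented converter name
# ("auto", "json", "conll", "ner", "iob") could be used instead.
convert(
    Path("train.conllu"),
    Path("corpus"),
    file_type="spacy",
    n_sents=10,
    converter="conll",
    silent=False,
)
```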
@@ -149,7 +149,7 @@ def _write_docs_to_file(docs, output_file, output_type):
    data = DocBin(docs=docs, store_user_data=True).to_bytes()
    with output_file.open("wb") as file_:
        file_.write(data)

def autodetect_ner_format(input_data: str) -> str:
    # guess format from the first 20 lines
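Since `_write_docs_to_file` serializes converted docs with `DocBin.to_bytes`, loading the output back is the mirror operation. A rough sketch, assuming a file written by the converter and a vocab compatible with the data; the path is hypothetical:

```python
from pathlib import Path

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # any pipeline whose vocab matches the data

data = Path("corpus/train.spacy").read_bytes()  # hypothetical output file
doc_bin = DocBin().from_bytes(data)
docs = list(doc_bin.get_docs(nlp.vocab))
print(f"restored {len(docs)} docs")
```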

View File

@@ -105,40 +105,29 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
[--n-sents] [--morphology] [--lang]
```
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ----------- |
| `input_file` | positional | Input file. |
| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
-| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. |
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--n-sents`, `-n` | option | Number of sentences per document. |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag | Segment sentences (for `-c ner`) |
| `--model`, `-b` <Tag variant="new">2.2</Tag> | option | Model for parser-based sentence segmentation (for `-s`) |
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
| `--help`, `-h` | flag | Show help message and available arguments. |
-| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
+| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
-### Output file types {new="2.1"}
-All output files generated by this command are compatible with
-[`spacy train`](/api/cli#train).
-| ID | Description |
-| ------- | -------------------------- |
-| `json` | Regular JSON (default). |
-| `jsonl` | Newline-delimited JSON. |
-| `msg` | Binary MessagePack format. |
-### Converter options
-| ID | Description |
-| ------------------------------ | ----------- |
-| `auto` | Automatically pick converter based on file extension and file content (default). |
-| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
-| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-| `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). |
+### Converters
+| ID | Description |
+| ------- | ----------- |
+| `auto` | Automatically pick converter based on file extension and file content (default). |
+| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/goldparse#docs_to_json). |
+| `conll` | Universal Dependencies `.conllu` or `.conll` format. |
+| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
+| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
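To make the `ner` converter row above concrete, here is a hedged sketch of the whitespace-separated IOB input it describes: one token plus a final IOB tag per line, blank lines between sentences, and `-DOCSTART- -X- O O` between documents. The file name and entity labels are made up for illustration:

```python
from pathlib import Path

sample = """-DOCSTART- -X- O O

Apple B-ORG
is O
opening O
an O
office O
in O
Berlin B-LOC
. O
"""
Path("train_ner.txt").write_text(sample, encoding="utf8")
# A file like this could then be converted with the documented options, e.g.:
#   python -m spacy convert train_ner.txt ./corpus --converter ner --n-sents 10
```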
## Debug data {#debug-data new="2.2"}
@@ -160,7 +149,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
| `--verbose`, `-V` | flag | Print additional information and explanations. |
-| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
+| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
<Accordion title="Example output">

View File

@@ -0,0 +1,10 @@
---
title: Example
teaser: A training example
tag: class
source: spacy/gold/example.pyx
---
<!-- TODO: -->
## Example.\_\_init\_\_ {#init tag="method"}

View File

@@ -67,7 +67,8 @@
{ "text": "Doc", "url": "/api/doc" }, { "text": "Doc", "url": "/api/doc" },
{ "text": "Token", "url": "/api/token" }, { "text": "Token", "url": "/api/token" },
{ "text": "Span", "url": "/api/span" }, { "text": "Span", "url": "/api/span" },
{ "text": "Lexeme", "url": "/api/lexeme" } { "text": "Lexeme", "url": "/api/lexeme" },
{ "text": "Example", "url": "/api/example" }
] ]
}, },
{ {