Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-07-03 16:48:27 +02:00
commit fe224dc2dd
4 changed files with 50 additions and 50 deletions

View File

@@ -28,7 +28,7 @@ CONVERTERS = {
# File types that can be written to stdout
-FILE_TYPES_STDOUT = ("json")
+FILE_TYPES_STDOUT = ("json",)
class FileTypes(str, Enum):
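The one-line fix above is easy to miss: `("json")` is just a parenthesized string, not a tuple, so membership checks against it do substring matching. The trailing comma in `("json",)` makes it a real one-element tuple. A minimal illustration (not from the repository):

```python
# Parentheses alone don't create a tuple; the trailing comma does.
not_a_tuple = ("json")   # just the string "json"
one_tuple = ("json",)    # a 1-element tuple

print("so" in not_a_tuple)   # True  -- substring match against the string
print("so" in one_tuple)     # False -- "so" is not an element of the tuple
print("json" in one_tuple)   # True
```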
@@ -86,20 +86,20 @@ def convert_cli(
def convert(
    input_path: Path,
    output_dir: Path,
    *,
    file_type: str = "json",
    n_sents: int = 1,
    seg_sents: bool = False,
    model: Optional[str] = None,
    morphology: bool = False,
    merge_subtokens: bool = False,
    converter: str = "auto",
    ner_map: Optional[Path] = None,
    lang: Optional[str] = None,
    silent: bool = True,
    msg: Optional[Path] = None,
) -> None:
    if not msg:
        msg = Printer(no_print=silent)
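The keyword-only signature above can also be called from Python rather than through the CLI. A minimal sketch, assuming `convert` is re-exported from `spacy.cli` as in the released packages; the paths and converter choice are hypothetical:

```python
from pathlib import Path

from spacy.cli import convert  # assumed re-export of the function shown above

# Hypothetical input/output paths; any documented converter name
# ("auto", "json", "conll", "ner", "iob") could be used instead.
convert(
    Path("train.conllu"),
    Path("corpus"),
    file_type="spacy",
    n_sents=10,
    converter="conll",
    silent=False,
)
```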
@@ -149,7 +149,7 @@ def _write_docs_to_file(docs, output_file, output_type):
    data = DocBin(docs=docs, store_user_data=True).to_bytes()
    with output_file.open("wb") as file_:
        file_.write(data)

def autodetect_ner_format(input_data: str) -> str:
    # guess format from the first 20 lines
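Since `_write_docs_to_file` serializes converted docs with `DocBin.to_bytes`, loading the output back is the mirror operation. A rough sketch, assuming a file written by the converter and a vocab compatible with the data; the path is hypothetical:

```python
from pathlib import Path

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # any pipeline whose vocab matches the data

data = Path("corpus/train.spacy").read_bytes()  # hypothetical output file
doc_bin = DocBin().from_bytes(data)
docs = list(doc_bin.get_docs(nlp.vocab))
print(f"restored {len(docs)} docs")
```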

View File

@@ -105,40 +105,29 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
[--n-sents] [--morphology] [--lang]
```
| Argument | Type | Description |
| ------------------------------------------------ | ---------- | ----------- |
| `input_file` | positional | Input file. |
| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
-| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. |
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
| `--n-sents`, `-n` | option | Number of sentences per document. |
| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag | Segment sentences (for `-c ner`) |
| `--model`, `-b` <Tag variant="new">2.2</Tag> | option | Model for parser-based sentence segmentation (for `-s`) |
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
| `--help`, `-h` | flag | Show help message and available arguments. |
-| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
+| **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). |
-### Output file types {new="2.1"}
-All output files generated by this command are compatible with
-[`spacy train`](/api/cli#train).
-| ID | Description |
-| ------- | -------------------------- |
-| `json` | Regular JSON (default). |
-| `jsonl` | Newline-delimited JSON. |
-| `msg` | Binary MessagePack format. |
-### Converter options
-| ID | Description |
-| ------------------------------ | ----------- |
-| `auto` | Automatically pick converter based on file extension and file content (default). |
-| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
-| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
-| `jsonl` | NER data formatted as JSONL with one dict per line and a `"text"` and `"spans"` key. This is also the format exported by the [Prodigy](https://prodi.gy) annotation tool. See [sample data](https://raw.githubusercontent.com/explosion/projects/master/ner-fashion-brands/fashion_brands_training.jsonl). |
+### Converters
+| ID | Description |
+| ------- | ----------- |
+| `auto` | Automatically pick converter based on file extension and file content (default). |
+| `json` | JSON-formatted training data used in spaCy v2.x and produced by [`docs2json`](/api/goldparse#docs_to_json). |
+| `conll` | Universal Dependencies `.conllu` or `.conll` format. |
+| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
+| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
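To make the `ner` converter row above concrete, here is a hedged sketch of the whitespace-separated IOB input it describes: one token plus a final IOB tag per line, blank lines between sentences, and `-DOCSTART- -X- O O` between documents. The file name and entity labels are made up for illustration:

```python
from pathlib import Path

sample = """-DOCSTART- -X- O O

Apple B-ORG
is O
opening O
an O
office O
in O
Berlin B-LOC
. O
"""
Path("train_ner.txt").write_text(sample, encoding="utf8")
# A file like this could then be converted with the documented options, e.g.:
#   python -m spacy convert train_ner.txt ./corpus --converter ner --n-sents 10
```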
## Debug data {#debug-data new="2.2"}
@@ -160,7 +149,7 @@ $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pi
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
| `--verbose`, `-V` | flag | Print additional information and explanations. |
-| --no-format, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
+| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
<Accordion title="Example output">

View File

@@ -0,0 +1,10 @@
---
title: Example
teaser: A training example
tag: class
source: spacy/gold/example.pyx
---
<!-- TODO: -->
## Example.\_\_init\_\_ {#init tag="method"}

View File

@@ -67,7 +67,8 @@
{ "text": "Doc", "url": "/api/doc" }, { "text": "Doc", "url": "/api/doc" },
{ "text": "Token", "url": "/api/token" }, { "text": "Token", "url": "/api/token" },
{ "text": "Span", "url": "/api/span" }, { "text": "Span", "url": "/api/span" },
{ "text": "Lexeme", "url": "/api/lexeme" } { "text": "Lexeme", "url": "/api/lexeme" },
{ "text": "Example", "url": "/api/example" }
] ]
}, },
{ {