mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
💫 Improve converters and training data file formats (#3374)
* Populate converter argument info automatically
* Add conversion option for msgpack
* Update docs
* Allow reading training data from JSONL
This commit is contained in:
parent
296446a1c8
commit
76764fcf59
|
@ -23,15 +23,16 @@ CONVERTERS = {
|
|||
}
|
||||
|
||||
# File types
|
||||
FILE_TYPES = ("json", "jsonl")
|
||||
FILE_TYPES = ("json", "jsonl", "msg")
|
||||
FILE_TYPES_STDOUT = ("json", "jsonl")
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
input_file=("Input file", "positional", None, str),
|
||||
output_dir=("Output directory for converted file", "positional", None, str),
|
||||
file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
|
||||
output_dir=("Output directory. '-' for stdout.", "positional", None, str),
|
||||
file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
|
||||
n_sents=("Number of sentences per doc", "option", "n", int),
|
||||
converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
|
||||
converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
|
||||
lang=("Language (if tokenizer required)", "option", "l", str),
|
||||
morphology=("Enable appending morphology to tags", "flag", "m", bool),
|
||||
)
|
||||
|
@ -58,6 +59,13 @@ def convert(
|
|||
"Supported file types: '{}'".format(", ".join(FILE_TYPES)),
|
||||
exits=1,
|
||||
)
|
||||
if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
|
||||
# TODO: support msgpack via stdout in srsly?
|
||||
msg.fail(
|
||||
"Can't write .{} data to stdout.".format(file_type),
|
||||
"Please specify an output directory.",
|
||||
exits=1,
|
||||
)
|
||||
if not input_path.exists():
|
||||
msg.fail("Input file not found", input_path, exits=1)
|
||||
if output_dir != "-" and not Path(output_dir).exists():
|
||||
|
@ -78,6 +86,8 @@ def convert(
|
|||
srsly.write_json(output_file, data)
|
||||
elif file_type == "jsonl":
|
||||
srsly.write_jsonl(output_file, data)
|
||||
elif file_type == "msg":
|
||||
srsly.write_msgpack(output_file, data)
|
||||
msg.good("Generated output file ({} documents)".format(len(data)), output_file)
|
||||
else:
|
||||
# Print to stdout
|
||||
|
|
|
@ -342,7 +342,7 @@ class Errors(object):
|
|||
"equal to span length ({span_len}).")
|
||||
E122 = ("Cannot find token to be split. Did it get merged?")
|
||||
E123 = ("Cannot find head of token to be split. Did it get merged?")
|
||||
E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
|
||||
E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
|
||||
E125 = ("Unexpected value: {value}")
|
||||
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||
|
|
|
@ -153,10 +153,13 @@ class GoldCorpus(object):
|
|||
loc = util.ensure_path(loc)
|
||||
if loc.parts[-1].endswith("json"):
|
||||
gold_tuples = read_json_file(loc)
|
||||
elif loc.parts[-1].endswith("jsonl"):
|
||||
gold_tuples = srsly.read_jsonl(loc)
|
||||
elif loc.parts[-1].endswith("msg"):
|
||||
gold_tuples = srsly.read_msgpack(loc)
|
||||
else:
|
||||
raise ValueError(Errors.E124.format(path=path2str(loc)))
|
||||
supported = ("json", "jsonl", "msg")
|
||||
raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += len(item[1])
|
||||
|
|
|
@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
|
|||
extension of the input file.
|
||||
|
||||
```bash
|
||||
$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
|
||||
[--morphology]
|
||||
$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
|
||||
[--n-sents] [--morphology] [--lang]
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------------------------------------------- | ---------- | ---------------------------------------------------------- |
|
||||
| `input_file` | positional | Input file. |
|
||||
| `output_dir` | positional | Output directory for converted JSON file. |
|
||||
| `converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
|
||||
| `--n-sents`, `-n` | option | Number of sentences per document. |
|
||||
| `--morphology`, `-m` | option | Enable appending morphology to tags. |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
|
||||
| Argument | Type | Description |
|
||||
| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- |
|
||||
| `input_file` | positional | Input file. |
|
||||
| `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
|
||||
| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option | Type of file to create (see below). |
|
||||
| `--converter`, `-c` <Tag variant="new">2</Tag> | option | Name of converter to use (see below). |
|
||||
| `--n-sents`, `-n` | option | Number of sentences per document. |
|
||||
| `--morphology`, `-m` | flag | Enable appending morphology to tags. |
|
||||
| `--lang`, `-l` <Tag variant="new">2.1</Tag> | option | Language code (if tokenizer required). |
|
||||
| `--help`, `-h` | flag | Show help message and available arguments. |
|
||||
| **CREATES** | JSON | Data in spaCy's [JSON format](/api/annotation#json-input). |
|
||||
|
||||
The following file format converters are available:
|
||||
### Output file types {new="2.1"}
|
||||
|
||||
| ID | Description |
|
||||
| ----------------- | --------------------------------------------------------------- |
|
||||
| `auto` | Automatically pick converter based on file extension (default). |
|
||||
| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format. |
|
||||
| `ner` | Tab-based named entity recognition format. |
|
||||
| `iob` | IOB or IOB2 named entity recognition format. |
|
||||
> #### Which format should I choose?
|
||||
>
|
||||
> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
|
||||
> that there's one JSON object per line. Unlike a regular JSON file, it can also
|
||||
> be read in line-by-line and you won't have to parse the _entire file_ first.
|
||||
> This makes it a very convenient format for larger corpora.
|
||||
|
||||
All output files generated by this command are compatible with
|
||||
[`spacy train`](/api/cli#train).
|
||||
|
||||
| ID | Description |
|
||||
| ------- | --------------------------------- |
|
||||
| `jsonl` | Newline-delimited JSON (default). |
|
||||
| `json` | Regular JSON. |
|
||||
| `msg` | Binary MessagePack format. |
|
||||
|
||||
### Converter options
|
||||
|
||||
<!-- TODO: document jsonl option – maybe update it? -->
|
||||
|
||||
| ID | Description |
|
||||
| ------------------------------ | --------------------------------------------------------------- |
|
||||
| `auto` | Automatically pick converter based on file extension (default). |
|
||||
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
|
||||
| `ner` | Tab-based named entity recognition format. |
|
||||
| `iob` | IOB or IOB2 named entity recognition format. |
|
||||
|
||||
## Train {#train}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user