mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
Support train dict format as JSONL (#4471)
* Support train dict format as JSONL
* Add (overly simple) check for dict vs. tuple to read JSONL lines as either train dicts or train tuples
* Extend JSON/JSONL roundtrip conversion tests using `docs_to_json()` and `GoldCorpus.train_tuples`
* Revert docs to default JSON output with convert
This commit is contained in:
parent
7fc39f124c
commit
8516e9d53b
|
@ -176,6 +176,11 @@ class GoldCorpus(object):
|
||||||
gold_tuples = read_json_file(loc)
|
gold_tuples = read_json_file(loc)
|
||||||
elif loc.parts[-1].endswith("jsonl"):
|
elif loc.parts[-1].endswith("jsonl"):
|
||||||
gold_tuples = srsly.read_jsonl(loc)
|
gold_tuples = srsly.read_jsonl(loc)
|
||||||
|
first_gold_tuple = next(gold_tuples)
|
||||||
|
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
|
||||||
|
# TODO: proper format checks with schemas
|
||||||
|
if isinstance(first_gold_tuple, dict):
|
||||||
|
gold_tuples = read_json_object(gold_tuples)
|
||||||
elif loc.parts[-1].endswith("msg"):
|
elif loc.parts[-1].endswith("msg"):
|
||||||
gold_tuples = srsly.read_msgpack(loc)
|
gold_tuples = srsly.read_msgpack(loc)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -99,14 +99,23 @@ def test_iob_to_biluo():
|
||||||
|
|
||||||
def test_roundtrip_docs_to_json():
|
def test_roundtrip_docs_to_json():
|
||||||
text = "I flew to Silicon Valley via London."
|
text = "I flew to Silicon Valley via London."
|
||||||
|
tags = ['PRP', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
|
||||||
|
heads = [1, 1, 1, 4, 2, 1, 5, 1]
|
||||||
|
deps = ['nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
|
||||||
|
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||||
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
|
||||||
nlp = English()
|
nlp = English()
|
||||||
doc = nlp(text)
|
doc = nlp(text)
|
||||||
|
for i in range(len(tags)):
|
||||||
|
doc[i].tag_ = tags[i]
|
||||||
|
doc[i].dep_ = deps[i]
|
||||||
|
doc[i].head = doc[heads[i]]
|
||||||
|
doc.ents = spans_from_biluo_tags(doc, biluo_tags)
|
||||||
doc.cats = cats
|
doc.cats = cats
|
||||||
doc[0].is_sent_start = True
|
doc.is_tagged = True
|
||||||
for i in range(1, len(doc)):
|
doc.is_parsed = True
|
||||||
doc[i].is_sent_start = False
|
|
||||||
|
|
||||||
|
# roundtrip to JSON
|
||||||
with make_tempdir() as tmpdir:
|
with make_tempdir() as tmpdir:
|
||||||
json_file = tmpdir / "roundtrip.json"
|
json_file = tmpdir / "roundtrip.json"
|
||||||
srsly.write_json(json_file, [docs_to_json(doc)])
|
srsly.write_json(json_file, [docs_to_json(doc)])
|
||||||
|
@ -116,6 +125,52 @@ def test_roundtrip_docs_to_json():
|
||||||
|
|
||||||
assert len(doc) == goldcorpus.count_train()
|
assert len(doc) == goldcorpus.count_train()
|
||||||
assert text == reloaded_doc.text
|
assert text == reloaded_doc.text
|
||||||
|
assert tags == goldparse.tags
|
||||||
|
assert deps == goldparse.labels
|
||||||
|
assert heads == goldparse.heads
|
||||||
|
assert biluo_tags == goldparse.ner
|
||||||
|
assert "TRAVEL" in goldparse.cats
|
||||||
|
assert "BAKING" in goldparse.cats
|
||||||
|
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
|
||||||
|
assert cats["BAKING"] == goldparse.cats["BAKING"]
|
||||||
|
|
||||||
|
# roundtrip to JSONL train dicts
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "roundtrip.jsonl"
|
||||||
|
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
|
||||||
|
|
||||||
|
assert len(doc) == goldcorpus.count_train()
|
||||||
|
assert text == reloaded_doc.text
|
||||||
|
assert tags == goldparse.tags
|
||||||
|
assert deps == goldparse.labels
|
||||||
|
assert heads == goldparse.heads
|
||||||
|
assert biluo_tags == goldparse.ner
|
||||||
|
assert "TRAVEL" in goldparse.cats
|
||||||
|
assert "BAKING" in goldparse.cats
|
||||||
|
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
|
||||||
|
assert cats["BAKING"] == goldparse.cats["BAKING"]
|
||||||
|
|
||||||
|
# roundtrip to JSONL tuples
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
jsonl_file = tmpdir / "roundtrip.jsonl"
|
||||||
|
# write to JSONL train dicts
|
||||||
|
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
# load and rewrite as JSONL tuples
|
||||||
|
srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
|
||||||
|
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
|
||||||
|
|
||||||
|
reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
|
||||||
|
|
||||||
|
assert len(doc) == goldcorpus.count_train()
|
||||||
|
assert text == reloaded_doc.text
|
||||||
|
assert tags == goldparse.tags
|
||||||
|
assert deps == goldparse.labels
|
||||||
|
assert heads == goldparse.heads
|
||||||
|
assert biluo_tags == goldparse.ner
|
||||||
assert "TRAVEL" in goldparse.cats
|
assert "TRAVEL" in goldparse.cats
|
||||||
assert "BAKING" in goldparse.cats
|
assert "BAKING" in goldparse.cats
|
||||||
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
|
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
|
||||||
|
|
|
@ -155,21 +155,14 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
|
||||||
|
|
||||||
### Output file types {new="2.1"}
|
### Output file types {new="2.1"}
|
||||||
|
|
||||||
> #### Which format should I choose?
|
|
||||||
>
|
|
||||||
> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
|
|
||||||
> that there's one JSON object per line. Unlike a regular JSON file, it can also
|
|
||||||
> be read in line-by-line and you won't have to parse the _entire file_ first.
|
|
||||||
> This makes it a very convenient format for larger corpora.
|
|
||||||
|
|
||||||
All output files generated by this command are compatible with
|
All output files generated by this command are compatible with
|
||||||
[`spacy train`](/api/cli#train).
|
[`spacy train`](/api/cli#train).
|
||||||
|
|
||||||
| ID | Description |
|
| ID | Description |
|
||||||
| ------- | --------------------------------- |
|
| ------- | -------------------------- |
|
||||||
| `jsonl` | Newline-delimited JSON (default). |
|
| `json` | Regular JSON (default). |
|
||||||
| `json` | Regular JSON. |
|
| `jsonl` | Newline-delimited JSON. |
|
||||||
| `msg` | Binary MessagePack format. |
|
| `msg` | Binary MessagePack format. |
|
||||||
|
|
||||||
### Converter options
|
### Converter options
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user