diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 7bf89c84a..3a5f508b4 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -176,6 +176,11 @@ class GoldCorpus(object): gold_tuples = read_json_file(loc) elif loc.parts[-1].endswith("jsonl"): gold_tuples = srsly.read_jsonl(loc) + first_gold_tuple = next(gold_tuples) + gold_tuples = itertools.chain([first_gold_tuple], gold_tuples) + # TODO: proper format checks with schemas + if isinstance(first_gold_tuple, dict): + gold_tuples = read_json_object(gold_tuples) elif loc.parts[-1].endswith("msg"): gold_tuples = srsly.read_msgpack(loc) else: diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 234a91443..01eaa51be 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -99,14 +99,23 @@ def test_iob_to_biluo(): def test_roundtrip_docs_to_json(): text = "I flew to Silicon Valley via London." + tags = ['PRP', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + heads = [1, 1, 1, 4, 2, 1, 5, 1] + deps = ['nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] + biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] cats = {"TRAVEL": 1.0, "BAKING": 0.0} nlp = English() doc = nlp(text) + for i in range(len(tags)): + doc[i].tag_ = tags[i] + doc[i].dep_ = deps[i] + doc[i].head = doc[heads[i]] + doc.ents = spans_from_biluo_tags(doc, biluo_tags) doc.cats = cats - doc[0].is_sent_start = True - for i in range(1, len(doc)): - doc[i].is_sent_start = False + doc.is_tagged = True + doc.is_parsed = True + # roundtrip to JSON with make_tempdir() as tmpdir: json_file = tmpdir / "roundtrip.json" srsly.write_json(json_file, [docs_to_json(doc)]) @@ -116,6 +125,52 @@ def test_roundtrip_docs_to_json(): assert len(doc) == goldcorpus.count_train() assert text == reloaded_doc.text + assert tags == goldparse.tags + assert deps == goldparse.labels + assert heads == goldparse.heads + assert biluo_tags == goldparse.ner + assert "TRAVEL" in goldparse.cats + assert "BAKING" in goldparse.cats + assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] + assert cats["BAKING"] == goldparse.cats["BAKING"] + + # roundtrip to JSONL train dicts + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "roundtrip.jsonl" + srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + + assert len(doc) == goldcorpus.count_train() + assert text == reloaded_doc.text + assert tags == goldparse.tags + assert deps == goldparse.labels + assert heads == goldparse.heads + assert biluo_tags == goldparse.ner + assert "TRAVEL" in goldparse.cats + assert "BAKING" in goldparse.cats + assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] + assert cats["BAKING"] == goldparse.cats["BAKING"] + + # roundtrip to JSONL tuples + with make_tempdir() as tmpdir: + jsonl_file = tmpdir / "roundtrip.jsonl" + # write to JSONL train dicts + srsly.write_jsonl(jsonl_file, [docs_to_json(doc)]) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + # load and rewrite as JSONL tuples + srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples) + goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) + + reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp)) + + assert len(doc) == goldcorpus.count_train() + assert text == reloaded_doc.text + assert tags == goldparse.tags + assert deps == goldparse.labels + assert heads == goldparse.heads + assert biluo_tags == goldparse.ner assert "TRAVEL" in goldparse.cats assert "BAKING" in goldparse.cats assert cats["TRAVEL"] == goldparse.cats["TRAVEL"] diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index aa28a14d1..e41a07374 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -155,21 +155,14 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] ### Output file types {new="2.1"} -> #### Which format should I choose? -> -> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means -> that there's one JSON object per line. Unlike a regular JSON file, it can also -> be read in line-by-line and you won't have to parse the _entire file_ first. -> This makes it a very convenient format for larger corpora. - All output files generated by this command are compatible with [`spacy train`](/api/cli#train). -| ID | Description | -| ------- | --------------------------------- | -| `jsonl` | Newline-delimited JSON (default). | -| `json` | Regular JSON. | -| `msg` | Binary MessagePack format. | +| ID | Description | +| ------- | -------------------------- | +| `json` | Regular JSON (default). | +| `jsonl` | Newline-delimited JSON. | +| `msg` | Binary MessagePack format. | ### Converter options