mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Support train dict format as JSONL (#4471)
* Support train dict format as JSONL * Add (overly simple) check for dict vs. tuple to read JSONL lines as either train dicts or train tuples * Extend JSON/JSONL roundtrip conversion tests using `docs_to_json()` and `GoldCorpus.train_tuples` * Revert docs to default JSON output with convert
This commit is contained in:
		
							parent
							
								
									7fc39f124c
								
							
						
					
					
						commit
						8516e9d53b
					
				| 
						 | 
					@ -176,6 +176,11 @@ class GoldCorpus(object):
 | 
				
			||||||
                gold_tuples = read_json_file(loc)
 | 
					                gold_tuples = read_json_file(loc)
 | 
				
			||||||
            elif loc.parts[-1].endswith("jsonl"):
 | 
					            elif loc.parts[-1].endswith("jsonl"):
 | 
				
			||||||
                gold_tuples = srsly.read_jsonl(loc)
 | 
					                gold_tuples = srsly.read_jsonl(loc)
 | 
				
			||||||
 | 
					                first_gold_tuple = next(gold_tuples)
 | 
				
			||||||
 | 
					                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
 | 
				
			||||||
 | 
					                # TODO: proper format checks with schemas
 | 
				
			||||||
 | 
					                if isinstance(first_gold_tuple, dict):
 | 
				
			||||||
 | 
					                    gold_tuples = read_json_object(gold_tuples)
 | 
				
			||||||
            elif loc.parts[-1].endswith("msg"):
 | 
					            elif loc.parts[-1].endswith("msg"):
 | 
				
			||||||
                gold_tuples = srsly.read_msgpack(loc)
 | 
					                gold_tuples = srsly.read_msgpack(loc)
 | 
				
			||||||
            else:
 | 
					            else:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -99,14 +99,23 @@ def test_iob_to_biluo():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_roundtrip_docs_to_json():
 | 
					def test_roundtrip_docs_to_json():
 | 
				
			||||||
    text = "I flew to Silicon Valley via London."
 | 
					    text = "I flew to Silicon Valley via London."
 | 
				
			||||||
 | 
					    tags = ['PRP', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
 | 
				
			||||||
 | 
					    heads = [1, 1, 1, 4, 2, 1, 5, 1]
 | 
				
			||||||
 | 
					    deps = ['nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
 | 
				
			||||||
 | 
					    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
 | 
				
			||||||
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
 | 
					    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
 | 
				
			||||||
    nlp = English()
 | 
					    nlp = English()
 | 
				
			||||||
    doc = nlp(text)
 | 
					    doc = nlp(text)
 | 
				
			||||||
 | 
					    for i in range(len(tags)):
 | 
				
			||||||
 | 
					        doc[i].tag_ = tags[i]
 | 
				
			||||||
 | 
					        doc[i].dep_ = deps[i]
 | 
				
			||||||
 | 
					        doc[i].head = doc[heads[i]]
 | 
				
			||||||
 | 
					    doc.ents = spans_from_biluo_tags(doc, biluo_tags)
 | 
				
			||||||
    doc.cats = cats
 | 
					    doc.cats = cats
 | 
				
			||||||
    doc[0].is_sent_start = True
 | 
					    doc.is_tagged = True
 | 
				
			||||||
    for i in range(1, len(doc)):
 | 
					    doc.is_parsed = True
 | 
				
			||||||
        doc[i].is_sent_start = False
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # roundtrip to JSON
 | 
				
			||||||
    with make_tempdir() as tmpdir:
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
        json_file = tmpdir / "roundtrip.json"
 | 
					        json_file = tmpdir / "roundtrip.json"
 | 
				
			||||||
        srsly.write_json(json_file, [docs_to_json(doc)])
 | 
					        srsly.write_json(json_file, [docs_to_json(doc)])
 | 
				
			||||||
| 
						 | 
					@ -116,6 +125,52 @@ def test_roundtrip_docs_to_json():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert len(doc) == goldcorpus.count_train()
 | 
					    assert len(doc) == goldcorpus.count_train()
 | 
				
			||||||
    assert text == reloaded_doc.text
 | 
					    assert text == reloaded_doc.text
 | 
				
			||||||
 | 
					    assert tags == goldparse.tags
 | 
				
			||||||
 | 
					    assert deps == goldparse.labels
 | 
				
			||||||
 | 
					    assert heads == goldparse.heads
 | 
				
			||||||
 | 
					    assert biluo_tags == goldparse.ner
 | 
				
			||||||
 | 
					    assert "TRAVEL" in goldparse.cats
 | 
				
			||||||
 | 
					    assert "BAKING" in goldparse.cats
 | 
				
			||||||
 | 
					    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
				
			||||||
 | 
					    assert cats["BAKING"] == goldparse.cats["BAKING"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # roundtrip to JSONL train dicts
 | 
				
			||||||
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
 | 
					        jsonl_file = tmpdir / "roundtrip.jsonl"
 | 
				
			||||||
 | 
					        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
 | 
				
			||||||
 | 
					        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert len(doc) == goldcorpus.count_train()
 | 
				
			||||||
 | 
					    assert text == reloaded_doc.text
 | 
				
			||||||
 | 
					    assert tags == goldparse.tags
 | 
				
			||||||
 | 
					    assert deps == goldparse.labels
 | 
				
			||||||
 | 
					    assert heads == goldparse.heads
 | 
				
			||||||
 | 
					    assert biluo_tags == goldparse.ner
 | 
				
			||||||
 | 
					    assert "TRAVEL" in goldparse.cats
 | 
				
			||||||
 | 
					    assert "BAKING" in goldparse.cats
 | 
				
			||||||
 | 
					    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
				
			||||||
 | 
					    assert cats["BAKING"] == goldparse.cats["BAKING"]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # roundtrip to JSONL tuples
 | 
				
			||||||
 | 
					    with make_tempdir() as tmpdir:
 | 
				
			||||||
 | 
					        jsonl_file = tmpdir / "roundtrip.jsonl"
 | 
				
			||||||
 | 
					        # write to JSONL train dicts
 | 
				
			||||||
 | 
					        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
 | 
				
			||||||
 | 
					        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
				
			||||||
 | 
					        # load and rewrite as JSONL tuples
 | 
				
			||||||
 | 
					        srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
 | 
				
			||||||
 | 
					        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    assert len(doc) == goldcorpus.count_train()
 | 
				
			||||||
 | 
					    assert text == reloaded_doc.text
 | 
				
			||||||
 | 
					    assert tags == goldparse.tags
 | 
				
			||||||
 | 
					    assert deps == goldparse.labels
 | 
				
			||||||
 | 
					    assert heads == goldparse.heads
 | 
				
			||||||
 | 
					    assert biluo_tags == goldparse.ner
 | 
				
			||||||
    assert "TRAVEL" in goldparse.cats
 | 
					    assert "TRAVEL" in goldparse.cats
 | 
				
			||||||
    assert "BAKING" in goldparse.cats
 | 
					    assert "BAKING" in goldparse.cats
 | 
				
			||||||
    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
					    assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -155,20 +155,13 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Output file types {new="2.1"}
 | 
					### Output file types {new="2.1"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Which format should I choose?
 | 
					 | 
				
			||||||
>
 | 
					 | 
				
			||||||
> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
 | 
					 | 
				
			||||||
> that there's one JSON object per line. Unlike a regular JSON file, it can also
 | 
					 | 
				
			||||||
> be read in line-by-line and you won't have to parse the _entire file_ first.
 | 
					 | 
				
			||||||
> This makes it a very convenient format for larger corpora.
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
All output files generated by this command are compatible with
 | 
					All output files generated by this command are compatible with
 | 
				
			||||||
[`spacy train`](/api/cli#train).
 | 
					[`spacy train`](/api/cli#train).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| ID      | Description                |
 | 
					| ID      | Description                |
 | 
				
			||||||
| ------- | --------------------------------- |
 | 
					| ------- | -------------------------- |
 | 
				
			||||||
| `jsonl` | Newline-delimited JSON (default). |
 | 
					| `json`  | Regular JSON (default).    |
 | 
				
			||||||
| `json`  | Regular JSON.                     |
 | 
					| `jsonl` | Newline-delimited JSON.    |
 | 
				
			||||||
| `msg`   | Binary MessagePack format. |
 | 
					| `msg`   | Binary MessagePack format. |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### Converter options
 | 
					### Converter options
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user