From 6ca6d7d6b4d2ea8fa596f3f7be4f244aa7902e15 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Thu, 18 Jun 2020 20:01:02 +0200
Subject: [PATCH] test for split sentences with various alignment issues, works

---
 spacy/tests/test_gold.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 8e1399fd0..d98a93f2f 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -90,6 +90,7 @@ def merged_dict():
     return {
         "ids": [1, 2, 3, 4, 5, 6, 7],
         "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "spaces": [True, True, True, True, True, True, False],
         "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
         "sent_starts": [1, 0, 0, 1, 0, 0, 0],
     }
@@ -150,6 +151,30 @@ def test_gold_biluo_misalign(en_vocab):
     assert tags == ["O", "O", "O", "-", "-", "-"]
 
 
+def test_split_sentences(en_vocab):
+    words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
+    doc = Doc(en_vocab, words=words)
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"]
+    sent_starts = [True, False, False, False, False, False, True, False, False, False]
+    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
+    assert example.text == "I flew to San Francisco Valley had loads of fun "
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2
+    assert split_examples[0].text == "I flew to San Francisco Valley "
+    assert split_examples[1].text == "had loads of fun "
+
+    words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
+    doc = Doc(en_vocab, words=words)
+    gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"]
+    sent_starts = [True, False, False, False, False, True, False, False]
+    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
+    assert example.text == "I flew to San Francisco Valley had loads of fun "
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2
+    assert split_examples[0].text == "I flew to San Francisco Valley "
+    assert split_examples[1].text == "had loads of fun "
+
+
 def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     # one-to-many
     words = ["I", "flew to", "San Francisco Valley", "."]
@@ -466,7 +491,7 @@ def _train(train_data):
 def test_split_sents(merged_dict):
     nlp = English()
     example = Example.from_dict(
-        Doc(nlp.vocab, words=merged_dict["words"]),
+        Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
         merged_dict
     )
     assert len(get_parses_from_example(
@@ -484,6 +509,8 @@ def test_split_sents(merged_dict):
     split_examples = example.split_sents()
     assert len(split_examples) == 2
+    assert split_examples[0].text == "Hi there everyone "
+    assert split_examples[1].text == "It is just me"
     token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
     assert token_annotation_1["words"] == ["Hi", "there", "everyone"]