diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 8cd4347c2..81c882967 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -19,6 +19,15 @@ def doc(en_tokenizer):
     return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
 
 
+@pytest.fixture
+def doc_not_parsed(en_tokenizer):
+    text = "This is a sentence. This is another sentence. And a third."
+    tokens = en_tokenizer(text)
+    d = get_doc(tokens.vocab, [t.text for t in tokens])
+    d.is_parsed = False
+    return d
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
@@ -34,6 +43,7 @@ def test_spans_root(doc):
     assert span.root.text == 'sentence'
     assert span.root.head.text == 'is'
 
+
 def test_spans_string_fn(doc):
     span = doc[0:4]
     assert len(span) == 4
@@ -41,6 +51,7 @@ def test_spans_string_fn(doc):
     assert span.upper_ == 'THIS IS A SENTENCE'
     assert span.lower_ == 'this is a sentence'
 
+
 def test_spans_root2(en_tokenizer):
     text = "through North and South Carolina"
     heads = [0, 3, -1, -2, -4]
@@ -49,12 +60,17 @@ def test_spans_root2(en_tokenizer):
     assert doc[-2:].root.text == 'Carolina'
 
 
-def test_spans_span_sent(doc):
+def test_spans_span_sent(doc, doc_not_parsed):
     """Test span.sent property"""
     assert len(list(doc.sents))
     assert doc[:2].sent.root.text == 'is'
     assert doc[:2].sent.text == 'This is a sentence .'
     assert doc[6:7].sent.root.left_edge.text == 'This'
+    # test on manual sbd
+    doc_not_parsed[0].is_sent_start = True
+    doc_not_parsed[5].is_sent_start = True
+    assert doc_not_parsed[1:3].sent == doc_not_parsed[0:5]
+    assert doc_not_parsed[10:14].sent == doc_not_parsed[5:]
 
 
 def test_spans_lca_matrix(en_tokenizer):