From c21efea9bb1d3e92ba16695190eae27e53e1d5e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ole=20Henrik=20Skogstr=C3=B8m?=
Date: Fri, 6 Jul 2018 15:54:15 +0200
Subject: [PATCH] Add sent property to token (#2521)

* Add sent property to token

* Refactored and cleaned up copy paste errors.
---
 spacy/tests/doc/test_token_api.py | 16 ++++++++++++++++
 spacy/tokens/token.pyx            |  7 +++++++
 2 files changed, 23 insertions(+)

diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index e81d5b7be..7ed27f989 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -56,6 +56,14 @@ def test_doc_token_api_str_builtin(en_tokenizer, text):
     assert str(tokens[0]) == text.split(' ')[0]
     assert str(tokens[1]) == text.split(' ')[1]
 
+@pytest.fixture
+def doc(en_tokenizer):
+    text = "This is a sentence. This is another sentence. And a third."
+    heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1]
+    deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
+            'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct']
+    tokens = en_tokenizer(text)
+    return get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
 
 def test_doc_token_api_is_properties(en_vocab):
     text = ["Hi", ",", "my", "email", "is", "test@me.com"]
@@ -162,3 +170,11 @@ def test_is_sent_start(en_tokenizer):
     assert doc[5].is_sent_start is True
     doc.is_parsed = True
     assert len(list(doc.sents)) == 2
+
+def test_tokens_sent(doc):
+    """Test token.sent property"""
+    assert len(list(doc.sents)) == 3
+    assert doc[1].sent.text == 'This is a sentence .'
+    assert doc[7].sent.text == 'This is another sentence .'
+    assert doc[1].sent.root.left_edge.text == 'This'
+    assert doc[7].sent.root.left_edge.text == 'This'
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index a0f1bf9d3..bd8f29859 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -356,6 +356,13 @@ cdef class Token:
         def __get__(self):
             return self.c.r_kids
 
+    property sent:
+        """RETURNS (Span): The sentence span that the token is a part of."""
+        def __get__(self):
+            if 'sent' in self.doc.user_token_hooks:
+                return self.doc.user_token_hooks['sent'](self)
+            return self.doc[self.i : self.i+1].sent
+
     property sent_start:
         def __get__(self):
             # Raising a deprecation warning here causes errors for autocomplete
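
For context, a minimal usage sketch of the new Token.sent property. This is not part of the patch; the en_core_web_sm model name and the example sentence are assumptions for illustration, and any pipeline that provides a dependency parse (or otherwise sets sentence boundaries) would work.

    import spacy

    # Assumes the en_core_web_sm model is installed. Token.sent needs
    # sentence boundaries, since it falls back to doc[i : i+1].sent.
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'This is a sentence. This is another sentence.')

    token = doc[6]  # "is" in the second sentence
    assert token.sent.text == 'This is another sentence.'

    # A pipeline can also override the lookup per document by registering
    # doc.user_token_hooks['sent'], which this property checks first.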