From 135062d23c94e5b9a6f68e7bfb280b1af8b570e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 19 Oct 2015 15:47:04 +1100 Subject: [PATCH] * Fix error with merged text when merged region did not have trailing whitespace --- spacy/tokens/doc.pyx | 3 ++- tests/tokens/test_tokens_api.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 55a83913b..c0cc6803b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -448,7 +448,8 @@ cdef class Doc: cdef Span span = self[start:end] # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) - new_orth = new_orth[:-len(span[-1].whitespace_)] + if span[-1].whitespace_: + new_orth = new_orth[:-len(span[-1].whitespace_)] cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts cdef TokenC* token = &self.data[start] diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index b40513b02..794a29bcb 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -124,6 +124,18 @@ def test_merge(EN): assert doc[4].tag_ == 'NAMED' +def test_merge_end_string(EN): + doc = EN('WKRO played songs by the beach boys all night') + + assert len(doc) == 9 + # merge 'all night' + doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), 'NAMED', 'LEMMA', 'TYPE') + assert len(doc) == 8 + + assert doc[7].text == 'all night' + assert doc[7].text_with_ws == 'all night' + + @pytest.mark.models def test_merge_children(EN): """Test that attachments work correctly after merging."""