* Fix error with merged text when merged region did not have trailing whitespace

This commit is contained in:
Matthew Honnibal 2015-10-19 15:47:04 +11:00
parent 0ce12e4548
commit 135062d23c
2 changed files with 14 additions and 1 deletions

View File

@ -448,7 +448,8 @@ cdef class Doc:
cdef Span span = self[start:end]
# Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span])
new_orth = new_orth[:-len(span[-1].whitespace_)]
if span[-1].whitespace_:
new_orth = new_orth[:-len(span[-1].whitespace_)]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts
cdef TokenC* token = &self.data[start]

View File

@ -124,6 +124,18 @@ def test_merge(EN):
assert doc[4].tag_ == 'NAMED'
def test_merge_end_string(EN):
    """Regression test: merge a span that runs to the very end of the text.

    The input string ends with 'night', so the merged region has no trailing
    whitespace. The merged token's ``text`` and ``text_with_ws`` must both be
    exactly 'all night'.
    """
    doc = EN('WKRO played songs by the beach boys all night')
    assert len(doc) == 9
    # Merge the final two tokens ('all night') using character offsets:
    # start of token 7 through the end of token 8.
    first, last = doc[7], doc[8]
    start_char = first.idx
    end_char = last.idx + len(last)
    doc.merge(start_char, end_char, 'NAMED', 'LEMMA', 'TYPE')
    assert len(doc) == 8
    merged = doc[7]
    assert merged.text == 'all night'
    assert merged.text_with_ws == 'all night'
@pytest.mark.models
def test_merge_children(EN):
"""Test that attachments work correctly after merging."""