mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Fix error with merged text when merged region did not have trailing whitespace
This commit is contained in:
parent
0ce12e4548
commit
135062d23c
|
@ -448,7 +448,8 @@ cdef class Doc:
|
|||
cdef Span span = self[start:end]
|
||||
# Get LexemeC for newly merged token
|
||||
new_orth = ''.join([t.text_with_ws for t in span])
|
||||
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
||||
if span[-1].whitespace_:
|
||||
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
||||
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
||||
# House the new merged token where it starts
|
||||
cdef TokenC* token = &self.data[start]
|
||||
|
|
|
@ -124,6 +124,18 @@ def test_merge(EN):
|
|||
assert doc[4].tag_ == 'NAMED'
|
||||
|
||||
|
||||
def test_merge_end_string(EN):
    """Merge the last two tokens of a text that has no trailing whitespace.

    The input string ends directly after 'night', so the merged token must
    report identical `text` and `text_with_ws`.
    """
    doc = EN('WKRO played songs by the beach boys all night')
    assert len(doc) == 9

    # Character offsets spanning the final two tokens, 'all night'.
    first, last = doc[7], doc[8]
    start_char = first.idx
    end_char = last.idx + len(last)
    doc.merge(start_char, end_char, 'NAMED', 'LEMMA', 'TYPE')

    # Two tokens collapsed into one, so the doc is one token shorter.
    assert len(doc) == 8

    merged = doc[7]
    assert merged.text == 'all night'
    assert merged.text_with_ws == 'all night'
|
||||
|
||||
|
||||
@pytest.mark.models
|
||||
def test_merge_children(EN):
|
||||
"""Test that attachments work correctly after merging."""
|
||||
|
|
Loading…
Reference in New Issue
Block a user