mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
* Fix error with merged text when merged region did not have trailing whitespace
This commit is contained in:
parent
0ce12e4548
commit
135062d23c
|
@ -448,7 +448,8 @@ cdef class Doc:
|
||||||
cdef Span span = self[start:end]
|
cdef Span span = self[start:end]
|
||||||
# Get LexemeC for newly merged token
|
# Get LexemeC for newly merged token
|
||||||
new_orth = ''.join([t.text_with_ws for t in span])
|
new_orth = ''.join([t.text_with_ws for t in span])
|
||||||
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
if span[-1].whitespace_:
|
||||||
|
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
||||||
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
cdef TokenC* token = &self.data[start]
|
cdef TokenC* token = &self.data[start]
|
||||||
|
|
|
@ -124,6 +124,18 @@ def test_merge(EN):
|
||||||
assert doc[4].tag_ == 'NAMED'
|
assert doc[4].tag_ == 'NAMED'
|
||||||
|
|
||||||
|
|
||||||
|
def test_merge_end_string(EN):
|
||||||
|
doc = EN('WKRO played songs by the beach boys all night')
|
||||||
|
|
||||||
|
assert len(doc) == 9
|
||||||
|
# merge 'all night' (the final two tokens, so the merged span has no trailing whitespace)
|
||||||
|
doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), 'NAMED', 'LEMMA', 'TYPE')
|
||||||
|
assert len(doc) == 8
|
||||||
|
|
||||||
|
assert doc[7].text == 'all night'
|
||||||
|
assert doc[7].text_with_ws == 'all night'
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
@pytest.mark.models
|
||||||
def test_merge_children(EN):
|
def test_merge_children(EN):
|
||||||
"""Test that attachments work correctly after merging."""
|
"""Test that attachments work correctly after merging."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user