mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Fix error with merged text when merged region did not have trailing whitespace
This commit is contained in:
parent
0ce12e4548
commit
135062d23c
|
@@ -448,6 +448,7 @@ cdef class Doc:
|
||||||
cdef Span span = self[start:end]
|
cdef Span span = self[start:end]
|
||||||
# Get LexemeC for newly merged token
|
# Get LexemeC for newly merged token
|
||||||
new_orth = ''.join([t.text_with_ws for t in span])
|
new_orth = ''.join([t.text_with_ws for t in span])
|
||||||
|
if span[-1].whitespace_:
|
||||||
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
new_orth = new_orth[:-len(span[-1].whitespace_)]
|
||||||
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
|
||||||
# House the new merged token where it starts
|
# House the new merged token where it starts
|
||||||
|
|
|
@@ -124,6 +124,18 @@ def test_merge(EN):
|
||||||
assert doc[4].tag_ == 'NAMED'
|
assert doc[4].tag_ == 'NAMED'
|
||||||
|
|
||||||
|
|
||||||
|
def test_merge_end_string(EN):
|
||||||
|
doc = EN('WKRO played songs by the beach boys all night')
|
||||||
|
|
||||||
|
assert len(doc) == 9
|
||||||
|
# merge the final two tokens, 'all night' (a span with no trailing whitespace)
|
||||||
|
doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), 'NAMED', 'LEMMA', 'TYPE')
|
||||||
|
assert len(doc) == 8
|
||||||
|
|
||||||
|
assert doc[7].text == 'all night'
|
||||||
|
assert doc[7].text_with_ws == 'all night'
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
|
@pytest.mark.models
|
||||||
def test_merge_children(EN):
|
def test_merge_children(EN):
|
||||||
"""Test that attachments work correctly after merging."""
|
"""Test that attachments work correctly after merging."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user