mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
Skip duplicate spans in Doc.retokenize (#4339)
This commit is contained in:
parent
71bd040834
commit
f7d1736241
|
@ -414,3 +414,14 @@ def test_doc_retokenizer_merge_lex_attrs(en_vocab):
|
|||
assert doc[1].is_stop
|
||||
assert not doc[0].is_stop
|
||||
assert not doc[1].like_num
|
||||
|
||||
|
||||
def test_retokenize_skip_duplicates(en_vocab):
    """Check that marking the very same span for merging twice is tolerated.

    The repeated request should be dropped silently by the retokenizer rather
    than being reported as an overlapping merge. See #3687.
    """
    words = ["hello", "world", "!"]
    doc = Doc(en_vocab, words=words)
    with doc.retokenize() as retokenizer:
        # Queue the identical (start, end) span two times in a row.
        for _ in range(2):
            retokenizer.merge(doc[0:2])
    # The first two words collapse into one token; "!" stays on its own.
    assert len(doc) == 2
    assert doc[0].text == "hello world"
|
||||
|
|
|
@ -35,12 +35,14 @@ cdef class Retokenizer:
|
|||
cdef list merges
|
||||
cdef list splits
|
||||
cdef set tokens_to_merge
|
||||
cdef list _spans_to_merge
|
||||
|
||||
def __init__(self, doc):
    """Bind the retokenizer to `doc` and start with empty bookkeeping."""
    self.doc = doc
    # Two separate, initially empty lists (never the same list object).
    self.merges, self.splits = [], []
    # Indices of tokens already claimed by a span passed to `merge`.
    self.tokens_to_merge = set()
    # (start, end) pairs of spans already marked, used to skip duplicates.
    self._spans_to_merge = list()
|
||||
|
||||
def merge(self, Span span, attrs=SimpleFrozenDict()):
|
||||
"""Mark a span for merging. The attrs will be applied to the resulting
|
||||
|
@ -51,10 +53,13 @@ cdef class Retokenizer:
|
|||
|
||||
DOCS: https://spacy.io/api/doc#retokenizer.merge
|
||||
"""
|
||||
if (span.start, span.end) in self._spans_to_merge:
|
||||
return
|
||||
for token in span:
|
||||
if token.i in self.tokens_to_merge:
|
||||
raise ValueError(Errors.E102.format(token=repr(token)))
|
||||
self.tokens_to_merge.add(token.i)
|
||||
self._spans_to_merge.append((span.start, span.end))
|
||||
if "_" in attrs: # Extension attributes
|
||||
extensions = attrs["_"]
|
||||
_validate_extensions(extensions)
|
||||
|
|
Loading…
Reference in New Issue
Block a user