Fix offset and whitespace in Matcher special cases
* Fix offset bugs when merging and splitting tokens
* Set final whitespace on final token in inserted special case
parent 11ba042aca
commit ae52c5eb52
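A minimal pure-Python sketch of the offset bookkeeping the first fix targets, using a plain list in place of the TokenC array; the names splice_special_cases and rules are illustrative, not spaCy's API. Spans are (text, start, end) tuples over the original tokenization, as in the diff below.

# Sketch only: replace each matched span with its special-case tokens,
# shifting later spans by the cumulative length difference.
def splice_special_cases(tokens, spans, rules):
    offset = 0  # cumulative length change from spans already spliced
    for text, start, end in spans:
        if text not in rules:
            continue
        replacement = rules[text]
        start += offset  # move into current, already-spliced coordinates
        end += offset
        tokens[start:end] = replacement
        # Mirrors `offset += span_length_diff` in the diff: later spans are
        # still indexed against the original tokenization.
        offset += len(replacement) - (end - start)
    return tokens

tokens = ["do", "n't", "stop"]
print(splice_special_cases(tokens, [("don't", 0, 2)], {"don't": ["don't"]}))
# ["don't", 'stop']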
@@ -220,9 +220,11 @@ cdef class Tokenizer:
         spans = [(span.text, span.start, span.end) for span in spans]
         # Modify tokenization according to filtered special cases
         cdef int offset = 0
-        cdef int span_length_diff
-        cdef int idx_offset
+        cdef int span_length_diff = 0
+        cdef int idx_offset = 0
         for span in spans:
+            if not span[0] in self._rules:
+                continue
             # Allocate more memory for doc if needed
             span_length_diff = len(self._rules[span[0]]) - (span[2] - span[1])
             while doc.length + offset + span_length_diff >= doc.max_length:
@@ -234,23 +236,26 @@ cdef class Tokenizer:
             # Shift original tokens...
             # ...from span position to end if new span is shorter
             if span_length_diff < 0:
-                for i in range(span[2] + offset, doc.length + offset):
+                for i in range(span[2] + offset, doc.length):
                     doc.c[span_length_diff + i] = doc.c[i]
             # ...from end to span position if new span is longer
             elif span_length_diff > 0:
-                for i in range(doc.length + offset - 1, span[2] + offset - 1, -1):
+                for i in range(doc.length - 1, span[2] + offset - 1, -1):
                     doc.c[span_length_diff + i] = doc.c[i]
             # Copy special case tokens into doc and adjust token and character
             # offsets
             idx_offset = 0
+            orig_final_spacy = doc.c[span[2] + offset - 1].spacy
             for i in range(cached.length):
                 orig_idx = doc.c[span[1] + offset + i].idx
                 doc.c[span[1] + offset + i] = cached.data.tokens[i]
                 doc.c[span[1] + offset + i].idx = orig_idx + idx_offset
-                idx_offset += cached.data.tokens[i].lex.length
+                idx_offset += cached.data.tokens[i].lex.length + \
+                    1 if cached.data.tokens[i].spacy else 0
+            doc.c[span[2] + offset + - 1].spacy = orig_final_spacy
             # Token offset for special case spans
             offset += span_length_diff
-            doc.length += offset
+            doc.length += span_length_diff
         return True

     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
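The whitespace fix can be sketched the same way, assuming (text, trailing_space) pairs in place of TokenC structs and their spacy flag; splice_with_whitespace is a hypothetical helper, not spaCy code. The cached special-case tokens carry their own flags, so the last inserted token must inherit whether a space followed the original span, which is what the orig_final_spacy save/restore above does.

def splice_with_whitespace(tokens, start, end, replacement):
    # Remember whether a space followed the original span in the text.
    orig_final_spacy = tokens[end - 1][1]
    tokens[start:end] = list(replacement)
    # Restore that trailing whitespace on the new final token of the span.
    new_end = start + len(replacement)
    text, _ = tokens[new_end - 1]
    tokens[new_end - 1] = (text, orig_final_spacy)
    return tokens

# "gimme it": "gimme" is followed by a space; the rule's tokens are not.
tokens = [("gimme", True), ("it", False)]
print(splice_with_whitespace(tokens, 0, 1, [("gim", False), ("me", False)]))
# [('gim', False), ('me', True), ('it', False)]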
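The idx_offset change follows the same pattern: each inserted token's character offset is the span's original start offset plus the lengths of the inserted tokens before it, now plus one per intervening space. A simplified sketch under the same assumptions (character_offsets is illustrative, not spaCy code):

def character_offsets(span_start_idx, pieces):
    # pieces: (text, trailing_space) pairs replacing the original span.
    offsets = []
    idx_offset = 0
    for text, trailing_space in pieces:
        offsets.append(span_start_idx + idx_offset)
        # Mirrors `idx_offset += ...lex.length + 1 if ...spacy else 0`.
        idx_offset += len(text) + (1 if trailing_space else 0)
    return offsets

# "don't" starts at character 8; split into ["do", "n't"] with no space.
print(character_offsets(8, [("do", False), ("n't", False)]))
# [8, 10]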