Formatting and whitespace [ci skip]

This commit is contained in:
Ines Montani 2019-02-14 18:05:07 +01:00
parent e104e47c21
commit 2569339a98

View File

@ -21,6 +21,7 @@ from ..attrs import intify_attrs
from ..util import SimpleFrozenDict from ..util import SimpleFrozenDict
from ..errors import Errors from ..errors import Errors
cdef class Retokenizer: cdef class Retokenizer:
"""Helper class for doc.retokenize() context manager.""" """Helper class for doc.retokenize() context manager."""
cdef Doc doc cdef Doc doc
@ -174,25 +175,21 @@ def _bulk_merge(Doc doc, merges):
def _get_start(merge): def _get_start(merge):
return merge[0].start return merge[0].start
merges.sort(key=_get_start)
merges.sort(key=_get_start)
for merge_index, (span, attributes) in enumerate(merges): for merge_index, (span, attributes) in enumerate(merges):
start = span.start start = span.start
end = span.end end = span.end
spans.append(span) spans.append(span)
# House the new merged token where it starts # House the new merged token where it starts
token = &doc.c[start] token = &doc.c[start]
tokens[merge_index] = token tokens[merge_index] = token
# Assign attributes # Assign attributes
for attr_name, attr_value in attributes.items(): for attr_name, attr_value in attributes.items():
if attr_name == TAG: if attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value) doc.vocab.morphology.assign_tag(token, attr_value)
else: else:
Token.set_struct_attr(token, attr_name, attr_value) Token.set_struct_attr(token, attr_name, attr_value)
# Resize the doc.tensor, if it's set. Let the last row for each token stand # Resize the doc.tensor, if it's set. Let the last row for each token stand
# for the merged region. To do this, we create a boolean array indicating # for the merged region. To do this, we create a boolean array indicating
# whether the row is to be deleted, then use numpy.delete # whether the row is to be deleted, then use numpy.delete
@ -205,7 +202,6 @@ def _bulk_merge(Doc doc, merges):
for i, span in enumerate(spans): for i, span in enumerate(spans):
span_roots.append(span.root.i) span_roots.append(span.root.i)
tokens[i].dep = span.root.dep tokens[i].dep = span.root.dep
# We update token.lex after keeping span root and dep, since # We update token.lex after keeping span root and dep, since
# setting token.lex will change span.start and span.end properties # setting token.lex will change span.start and span.end properties
# as it modifies the character offsets in the doc # as it modifies the character offsets in the doc
@ -217,7 +213,6 @@ def _bulk_merge(Doc doc, merges):
tokens[token_index].lex = lex tokens[token_index].lex = lex
# We set trailing space here too # We set trailing space here too
tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy tokens[token_index].spacy = doc.c[spans[token_index].end-1].spacy
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a # Before thinking of something simpler, beware the case where a
@ -225,11 +220,9 @@ def _bulk_merge(Doc doc, merges):
# tokens changes. # tokens changes.
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head += i doc.c[i].head += i
# Set the head of the merged token from the Span # Set the head of the merged token from the Span
for i in range(len(merges)): for i in range(len(merges)):
tokens[i].head = doc.c[span_roots[i]].head tokens[i].head = doc.c[span_roots[i]].head
# Adjust deps before shrinking tokens # Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it # Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end # Subtract the offset from all tokens which point to >= end
@ -241,16 +234,13 @@ def _bulk_merge(Doc doc, merges):
#last token was the last of the span #last token was the last of the span
current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1 current_offset += (spans[current_span_index].end - spans[current_span_index].start) -1
current_span_index += 1 current_span_index += 1
if current_span_index < len(spans) and \ if current_span_index < len(spans) and \
spans[current_span_index].start <= i < spans[current_span_index].end: spans[current_span_index].start <= i < spans[current_span_index].end:
offsets.append(spans[current_span_index].start - current_offset) offsets.append(spans[current_span_index].start - current_offset)
else: else:
offsets.append(i - current_offset) offsets.append(i - current_offset)
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head = offsets[doc.c[i].head] doc.c[i].head = offsets[doc.c[i].head]
# Now compress the token array # Now compress the token array
offset = 0 offset = 0
in_span = False in_span = False
@ -272,14 +262,11 @@ def _bulk_merge(Doc doc, merges):
memset(&doc.c[i], 0, sizeof(TokenC)) memset(&doc.c[i], 0, sizeof(TokenC))
doc.c[i].lex = &EMPTY_LEXEME doc.c[i].lex = &EMPTY_LEXEME
doc.length -= offset doc.length -= offset
# ...And, set heads back to a relative position # ...And, set heads back to a relative position
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head -= i doc.c[i].head -= i
# Set the left/right children, left/right edges # Set the left/right children, left/right edges
set_children_from_heads(doc.c, doc.length) set_children_from_heads(doc.c, doc.length)
# Make sure ent_iob remains consistent # Make sure ent_iob remains consistent
for (span, _) in merges: for (span, _) in merges:
if(span.end < len(offsets)): if(span.end < len(offsets)):
@ -329,13 +316,10 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
token_head_index = index token_head_index = index
if token_head_index == -1: if token_head_index == -1:
raise ValueError(Errors.E113) raise ValueError(Errors.E113)
# First, make the dependencies absolutes, and adjust all possible dependencies before # First, make the dependencies absolutes, and adjust all possible dependencies before
# creating the tokens # creating the tokens
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head += i doc.c[i].head += i
# Adjust dependencies # Adjust dependencies
offset = nb_subtokens - 1 offset = nb_subtokens - 1
for i in range(doc.length): for i in range(doc.length):
@ -344,22 +328,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
doc.c[i].head = token_head_index doc.c[i].head = token_head_index
elif head_idx > token_index: elif head_idx > token_index:
doc.c[i].head += offset doc.c[i].head += offset
new_token_head = doc.c[token_index].head new_token_head = doc.c[token_index].head
# Double doc.c max_length if necessary (until big enough for all new tokens) # Double doc.c max_length if necessary (until big enough for all new tokens)
while doc.length + nb_subtokens - 1 >= doc.max_length: while doc.length + nb_subtokens - 1 >= doc.max_length:
doc._realloc(doc.length * 2) doc._realloc(doc.length * 2)
# Move tokens after the split to create space for the new tokens # Move tokens after the split to create space for the new tokens
doc.length = len(doc) + nb_subtokens -1 doc.length = len(doc) + nb_subtokens -1
for token_to_move in range(doc.length - 1, token_index, -1): for token_to_move in range(doc.length - 1, token_index, -1):
doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move] doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
# Host the tokens in the newly created space # Host the tokens in the newly created space
cdef int idx_offset = 0 cdef int idx_offset = 0
for i, orth in enumerate(orths): for i, orth in enumerate(orths):
token = &doc.c[token_index + i] token = &doc.c[token_index + i]
lex = doc.vocab.get(doc.mem, orth) lex = doc.vocab.get(doc.mem, orth)
token.lex = lex token.lex = lex
@ -367,21 +346,18 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
if i != 0: if i != 0:
token.idx = orig_token.idx + idx_offset token.idx = orig_token.idx + idx_offset
idx_offset += len(orth) idx_offset += len(orth)
# Set token.spacy to False for all non-last split tokens, and # Set token.spacy to False for all non-last split tokens, and
# to origToken.spacy for the last token # to origToken.spacy for the last token
if (i < nb_subtokens - 1): if (i < nb_subtokens - 1):
token.spacy = False token.spacy = False
else: else:
token.spacy = orig_token.spacy token.spacy = orig_token.spacy
# Apply attrs to each subtoken # Apply attrs to each subtoken
for attr_name, attr_value in attrs.items(): for attr_name, attr_value in attrs.items():
if attr_name == TAG: if attr_name == TAG:
doc.vocab.morphology.assign_tag(token, attr_value) doc.vocab.morphology.assign_tag(token, attr_value)
else: else:
Token.set_struct_attr(token, attr_name, attr_value) Token.set_struct_attr(token, attr_name, attr_value)
# Make IOB consistent # Make IOB consistent
if (orig_token.ent_iob == 3): if (orig_token.ent_iob == 3):
if i == 0: if i == 0:
@ -391,22 +367,17 @@ def _split(Doc doc, int token_index, orths, heads, deps, attrs):
else: else:
# In all other cases subtokens inherit iob from origToken # In all other cases subtokens inherit iob from origToken
token.ent_iob = orig_token.ent_iob token.ent_iob = orig_token.ent_iob
# Use the head of the new token everywhere. This will be partially overwritten later on. # Use the head of the new token everywhere. This will be partially overwritten later on.
token.head = new_token_head token.head = new_token_head
# Transform the dependencies into relative ones again # Transform the dependencies into relative ones again
for i in range(doc.length): for i in range(doc.length):
doc.c[i].head -= i doc.c[i].head -= i
# Assign correct dependencies to the inner token # Assign correct dependencies to the inner token
for i, head in enumerate(heads): for i, head in enumerate(heads):
if head != 0: if head != 0:
# the token's head's head is already correct # the token's head's head is already correct
doc.c[token_index + i].head = head doc.c[token_index + i].head = head
for i, dep in enumerate(deps): for i, dep in enumerate(deps):
doc[token_index + i].dep = dep doc[token_index + i].dep = dep
# set children from head # set children from head
set_children_from_heads(doc.c, doc.length) set_children_from_heads(doc.c, doc.length)