* merge add lex last - add index finder funcs

This commit is contained in:
Andreas Grivas 2015-11-05 17:28:08 +02:00 committed by Matthew Honnibal
parent a06e3c8963
commit 562db6d2d0

View File

@ -438,11 +438,25 @@ cdef class Doc:
keep_reading = False keep_reading = False
yield n_bytes_str + data yield n_bytes_str + data
# This function is terrible --- need to fix this. def token_index_start(self, int start_idx):
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, """ Get index of token in doc that has character index start_idx """
unicode ent_type): cdef int i
"""Merge a multi-word expression into a single token. Currently for i in range(self.length):
experimental; API is likely to change.""" if self.c[i].idx == start_idx:
return i
return None
def token_index_end(self, int end_idx):
""" Get index+1 of token in doc ending with character index end_idx """
cdef int i
for i in range(self.length):
if (self.c[i].idx + self.c[i].lex.length) == end_idx:
return i + 1
return None
def range_from_indices(self, int start_idx, int end_idx):
""" Get tuple - span of token indices which correspond to
character indices (start_idx, end_idx) if such a span exists"""
cdef int i cdef int i
cdef int start = -1 cdef int start = -1
cdef int end = -1 cdef int end = -1
@ -453,10 +467,18 @@ cdef class Doc:
if start == -1: if start == -1:
return None return None
end = i + 1 end = i + 1
break return (start, end)
else: return None
return None
# This function is terrible --- need to fix this.
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
unicode ent_type):
"""Merge a multi-word expression into a single token. Currently
experimental; API is likely to change."""
start_end = self.range_from_indices(start_idx, end_idx)
if start_end is None:
return None
start, end = start_end
cdef Span span = self[start:end] cdef Span span = self[start:end]
# Get LexemeC for newly merged token # Get LexemeC for newly merged token
new_orth = ''.join([t.text_with_ws for t in span]) new_orth = ''.join([t.text_with_ws for t in span])
@ -465,8 +487,6 @@ cdef class Doc:
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts # House the new merged token where it starts
cdef TokenC* token = &self.c[start] cdef TokenC* token = &self.c[start]
# Update fields
token.lex = lex
token.spacy = self.c[end-1].spacy token.spacy = self.c[end-1].spacy
if tag in self.vocab.morphology.tag_map: if tag in self.vocab.morphology.tag_map:
self.vocab.morphology.assign_tag(token, tag) self.vocab.morphology.assign_tag(token, tag)
@ -485,6 +505,10 @@ cdef class Doc:
# bridges over the entity. Here the alignment of the tokens changes. # bridges over the entity. Here the alignment of the tokens changes.
span_root = span.root.i span_root = span.root.i
token.dep = span.root.dep token.dep = span.root.dep
# We update token.lex after keeping span root and dep, since
# setting token.lex will change span.start and span.end properties
# as it modifies the character offsets in the doc
token.lex = lex
for i in range(self.length): for i in range(self.length):
self.c[i].head += i self.c[i].head += i
# Set the head of the merged token, and its dep relation, from the Span # Set the head of the merged token, and its dep relation, from the Span