* span start, end -> properties. autoupdate after merge

This commit is contained in:
Andreas Grivas 2015-11-05 17:38:38 +02:00 committed by Matthew Honnibal
parent 562db6d2d0
commit 4be7fda453
2 changed files with 48 additions and 4 deletions

View File

@ -4,8 +4,10 @@ from .doc cimport Doc
cdef class Span: cdef class Span:
cdef readonly Doc doc cdef readonly Doc doc
cdef public int i cdef public int i
cdef public int start cdef public int start_token
cdef public int end cdef public int end_token
cdef public int start_idx
cdef public int end_idx
cdef readonly int label cdef readonly int label
cdef public _vector cdef public _vector

View File

@ -21,8 +21,10 @@ cdef class Span:
raise IndexError raise IndexError
self.doc = tokens self.doc = tokens
self.start = start self.start_token = start
self.end = end self.start_idx = self.doc[start].idx
self.end_token = end
self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
self.label = label self.label = label
self._vector = vector self._vector = vector
self._vector_norm = vector_norm self._vector_norm = vector_norm
@ -76,6 +78,46 @@ cdef class Span:
return 0.0 return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property start:
    """Get the start token index of this span in the Doc.

    The cached position (``self.start_token``) is validated against the
    Doc on every access and refreshed if the Doc's tokenization has
    changed since the position was cached (e.g. after spans were merged).

    Returns:
        int: position in ``self.doc`` of the first token of the span.

    Raises:
        IndexError: if the Doc reports no token for ``self.start_idx``.
    """
    def __get__(self):
        # Fetch the token currently sitting at the cached position, provided
        # that position is still inside the Doc.
        first = None
        if self.start_token < len(self.doc):
            first = self.doc[self.start_token]
        # The cached position is stale when it falls outside the Doc or when
        # the token found there no longer carries the `idx` recorded in
        # `self.start_idx`.
        if first is None or first.idx != self.start_idx:
            # Ask the Doc for the token position matching start_idx.
            # NOTE(review): token_index_start is defined outside this view;
            # presumably it returns None when nothing matches - confirm.
            new_start = self.doc.token_index_start(self.start_idx)
            if new_start is None:
                # The trailing space keeps the two concatenated literals
                # from running together in the rendered message.
                raise IndexError(
                    'Something went terribly wrong during a merge. '
                    'No token found with idx %s' % self.start_idx)
            self.start_token = new_start
        return self.start_token
property end:
    """Get the end token index of this span in the Doc.

    The cached position (``self.end_token``) is validated against the Doc
    on every access and refreshed if the Doc's tokenization has changed
    since the position was cached (e.g. after spans were merged). The
    span's last token is the one at ``end - 1``.

    Returns:
        int: position in ``self.doc`` one past the last token of the span.

    Raises:
        IndexError: if the Doc reports no token for ``self.end_idx``.
    """
    def __get__(self):
        # Fetch the span's last token (at end_token - 1), provided the cached
        # end position is still inside the Doc.
        # NOTE(review): when end_token == 0 the lookup below uses index -1;
        # confirm whether that case can occur and is intended.
        last = None
        if self.end_token <= len(self.doc):
            last = self.doc[self.end_token - 1]
        # The cached position is stale when it falls outside the Doc or when
        # `last.idx + len(last)` no longer equals the recorded `self.end_idx`.
        if last is None or last.idx + len(last) != self.end_idx:
            # Ask the Doc for the end position matching end_idx.
            # NOTE(review): token_index_end is defined outside this view;
            # presumably it returns None when nothing matches - confirm.
            new_end = self.doc.token_index_end(self.end_idx)
            if new_end is None:
                # The trailing space keeps the two concatenated literals
                # from running together in the rendered message.
                raise IndexError(
                    'Something went terribly wrong during a merge. '
                    'No token found with idx %s' % self.end_idx)
            self.end_token = new_end
        return self.end_token
property vector: property vector:
def __get__(self): def __get__(self):
if self._vector is None: if self._vector is None: