diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 54c0a3afb..bae9e4691 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 7b74be492..83129b0a3 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -21,8 +21,10 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -76,6 +78,58 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        """The index of the span's first token in the parent Doc.
+
+        The cached token index can go stale when tokens in the Doc are
+        merged, so the token at it is checked against start_idx and the
+        index is looked up again through Doc.token_index_start when they no
+        longer agree. Raises IndexError if that lookup returns None.
+        """
+        def __get__(self):
+            # Fetch the token currently at the cached position, if that
+            # position is still inside the Doc.
+            first = None
+            if self.start_token < len(self.doc):
+                first = self.doc[self.start_token]
+            # After a merge the cached index may be out of range or may
+            # point at a different token; start_idx is what identifies the
+            # span's first token.
+            if first is None or first.idx != self.start_idx:
+                # Look the start position up again from start_idx.
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is None:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+                self.start_token = new_start
+            return self.start_token
+
+    property end:
+        """The index one past the span's last token in the parent Doc.
+
+        The cached token index can go stale when tokens in the Doc are
+        merged, so the token before it is checked against end_idx and the
+        index is looked up again through Doc.token_index_end when they no
+        longer agree. Raises IndexError if that lookup returns None.
+        """
+        def __get__(self):
+            # Fetch the span's last token (the one just before the cached
+            # end position), if that position is still inside the Doc.
+            last = None
+            if self.end_token <= len(self.doc):
+                last = self.doc[self.end_token - 1]
+            # After a merge the cached index may be out of range or the
+            # token before it may no longer finish at end_idx, which is
+            # compared against the token's idx plus its length.
+            if last is None or last.idx + len(last) != self.end_idx:
+                # Look the end position up again from end_idx.
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is None:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+                self.end_token = new_end
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None: