diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f1c8d2c71..d11054e35 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -441,6 +441,7 @@ cdef class Doc: def token_index_start(self, int start_idx): + """ Get index of token in doc that has character index start_idx """ cdef int i for i in range(self.length): if self.data[i].idx == start_idx: @@ -448,6 +449,7 @@ cdef class Doc: return None def token_index_end(self, int end_idx): + """ Get index+1 of token in doc ending with character index end_idx """ cdef int i for i in range(self.length): if (self.data[i].idx + self.data[i].lex.length) == end_idx: @@ -455,6 +457,8 @@ cdef class Doc: return None def range_from_indices(self, int start_idx, int end_idx): + """ Get tuple - span of token indices which correspond to + character indices (start_idx, end_idx) if such a span exists""" assert start_idx < end_idx cdef int i cdef int start = -1 diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index f4dcb15f0..afd809ecf 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -14,14 +14,15 @@ from ..util import normalize_slice cdef class Span: - """A slice from a Doc object.""" + """A slice from a Doc object. Internally keeps character offsets in order + to keep track of changes (merges) in the original Doc. 
Updates are + made in the start and end properties.""" def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): if not (0 <= start <= end <= len(tokens)): raise IndexError self.doc = tokens - # keep char offsets - as these don't change when merging spans self.start_token = start self.start_idx = self.doc[start].idx self.end_token = end @@ -80,9 +81,14 @@ cdef class Span: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property start: + """ Get start token index of this span from the Doc.""" def __get__(self): - # if we haven't merged anything below check is false - so we get start token + # if spans have been merged in the Doc, start might have changed. + # check that the start token index is within the doc and that the token's + # character offset is still start_idx (it hasn't changed). + # Potential IndexError if only the second condition were used if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx: + # go through tokens in Doc - find index of the token whose character offset is start_idx new_start = self.doc.token_index_start(self.start_idx) if new_start is not None: self.start_token = new_start @@ -92,9 +98,14 @@ cdef class Span: return self.start_token property end: + """ Get end token index of this span from the Doc.""" def __get__(self): - # if we haven't merged anything we have fast access + # if spans have been merged in the Doc, end might have changed. + # check that the end token index is within the doc and that the last token's + # character offset still matches end_idx (it hasn't changed). + # Potential IndexError if only the second condition were used if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx: + # go through tokens in Doc - find index+1 of the token ending at character index end_idx new_end = self.doc.token_index_end(self.end_idx) if new_end is not None: self.end_token = new_end