added comments

2025-09-21 03:22:37 +03:00 · 2015-11-04 12:56:07 +02:00 · 2015-11-04 12:56:07 +02:00 · 015a84a5ec
commit 015a84a5ec
parent 91d74ed8ee
2 changed files with 19 additions and 4 deletions
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -441,6 +441,7 @@ cdef class Doc:


    def token_index_start(self, int start_idx):
+        """ Get index of token in doc that has character index start_idx """
        cdef int i
        for i in range(self.length):
            if self.data[i].idx == start_idx:
@ -448,6 +449,7 @@ cdef class Doc:
        return None

    def token_index_end(self, int end_idx):
+        """ Get index+1 of token in doc ending with character index end_idx """
        cdef int i
        for i in range(self.length):
            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
@ -455,6 +457,8 @@ cdef class Doc:
        return None

    def range_from_indices(self, int start_idx, int end_idx):
+        """ Get tuple - span of token indices which correspond to
+            character indices (start_idx, end_idx) if such a span exists"""
        assert start_idx < end_idx
        cdef int i
        cdef int start = -1
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@ -14,14 +14,15 @@ from ..util import normalize_slice


 cdef class Span:
-    """A slice from a Doc object."""
+    """A slice from a Doc object. Internally keeps character offsets in order
+       to keep track of changes (merges) in the original Doc. Updates are
+       made in the start and end properties."""
    def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
                  vector_norm=None):
        if not (0 <= start <= end <= len(tokens)):
            raise IndexError

        self.doc = tokens
-        # keep char offsets - as these don't change when merging spans
        self.start_token = start
        self.start_idx = self.doc[start].idx
        self.end_token = end
@ -80,9 +81,14 @@ cdef class Span:
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property start:
+        """ Get start token index of this span from the Doc."""
        def __get__(self):
-            # if we haven't merged anything below check is false - so we get start token
+            # If spans in the Doc have been merged, start might have changed.
+            # Re-resolve it when start_token >= len(doc), or when the token at
+            # start_token no longer has a character index equal to start_idx.
+            # The range check comes first: indexing alone could raise IndexError.
            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                # scan the Doc for the token whose character index is start_idx
                new_start = self.doc.token_index_start(self.start_idx)
                if new_start is not None:
                    self.start_token = new_start
@ -92,9 +98,14 @@ cdef class Span:
            return self.start_token

    property end:
+        """ Get end token index of this span from the Doc."""
        def __get__(self):
-            # if we haven't merged anything we have fast access
+            # If spans in the Doc have been merged, end might have changed.
+            # Re-resolve it when end_token >= len(doc), or when the token just
+            # before end_token has an idx different from end_idx.
+            # The range check comes first: indexing alone could raise IndexError.
            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                # scan the Doc for the token that ends at character index end_idx
                new_end = self.doc.token_index_end(self.end_idx)
                if new_end is not None:
                    self.end_token = new_end