Mirror of https://github.com/explosion/spaCy.git, synced 2025-08-07 05:40:20 +03:00
added comments
This commit is contained in:
parent 91d74ed8ee
commit 015a84a5ec
@@ -441,6 +441,7 @@ cdef class Doc:

    def token_index_start(self, int start_idx):
        """ Get index of token in doc that has character index start_idx """
        cdef int i
        for i in range(self.length):
            if self.data[i].idx == start_idx:
@@ -448,6 +449,7 @@ cdef class Doc:
        return None

    def token_index_end(self, int end_idx):
        """ Get index+1 of token in doc ending with character index end_idx """
        cdef int i
        for i in range(self.length):
            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
@@ -455,6 +457,8 @@ cdef class Doc:
        return None

    def range_from_indices(self, int start_idx, int end_idx):
        """ Get tuple - span of token indices which correspond to
        character indices (start_idx, end_idx) if such a span exists"""
        assert start_idx < end_idx
        cdef int i
        cdef int start = -1
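
The three helpers above all perform the same linear scan over the Doc's token array, keyed on character offsets rather than token indices. Below is a minimal pure-Python sketch of that lookup; the (start_char, length) token table and standalone function names are illustrative stand-ins, not the real Doc internals.

# Pure-Python sketch of the lookups above; `tokens` is a hypothetical list of
# (start_char, length) pairs standing in for Doc's internal token array.
def token_index_start(tokens, start_idx):
    """Index of the token whose first character is start_idx, else None."""
    for i, (idx, length) in enumerate(tokens):
        if idx == start_idx:
            return i
    return None

def token_index_end(tokens, end_idx):
    """Index + 1 of the token whose text ends at character end_idx, else None."""
    for i, (idx, length) in enumerate(tokens):
        if idx + length == end_idx:
            return i + 1
    return None

# "This is a test." -> "This" / "is" / "a" / "test" / "."
tokens = [(0, 4), (5, 2), (8, 1), (10, 4), (14, 1)]
assert token_index_start(tokens, 5) == 1     # "is" starts at character 5
assert token_index_end(tokens, 14) == 4      # "test" ends at character 14
assert token_index_start(tokens, 6) is None  # no token starts mid-word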
@@ -14,14 +14,15 @@ from ..util import normalize_slice


cdef class Span:
-   """A slice from a Doc object."""
+   """A slice from a Doc object. Internally keeps character offsets in order
+   to keep track of changes (merges) in the original Doc. Updates are
+   made in start and end property."""
    def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
                  vector_norm=None):
        if not (0 <= start <= end <= len(tokens)):
            raise IndexError

        self.doc = tokens
        # keep char offsets - as these don't change when merging spans
        self.start_token = start
        self.start_idx = self.doc[start].idx
        self.end_token = end
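
The cached character offsets matter because merging tokens shifts token indices but leaves character positions untouched. A small illustration (plain Python, not spaCy code):

# Why character offsets survive a merge while token indices do not (illustration only).
text = "New York is a city"
before = [0, 4, 9, 12, 14]   # start chars of: New / York / is / a / city
after  = [0, 9, 12, 14]      # after merging "New York": indices shift, offsets do not
# A Span that cached start_idx=14 ("city") no longer finds it at token index 4,
# but can re-locate it by scanning for the character offset, as token_index_start does.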
@@ -80,9 +81,14 @@ cdef class Span:
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property start:
        """ Get start token index of this span from the Doc."""
        def __get__(self):
            # if we haven't merged anything below check is false - so we get start token
            # if we have merged spans in Doc start might have changed.
            # check if token start index is in doc index range and the token
            # index is start_idx (it hasn't changed).
            # Potential IndexError if only second condition was used
            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
                # go through tokens in Doc - find index of token equal to start_idx
                new_start = self.doc.token_index_start(self.start_idx)
                if new_start is not None:
                    self.start_token = new_start
@@ -92,9 +98,14 @@ cdef class Span:
        return self.start_token

    property end:
        """ Get end token index of this span from the Doc."""
        def __get__(self):
            # if we haven't merged anything we have fast access
            # if we have merged spans in Doc end will have changed.
            # check if token end index is in doc index range and the token
            # index is end_idx (it hasn't changed).
            # Potential IndexError if only second condition was used
            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
                # go through tokens in Doc - find index of token equal to end_idx
                new_end = self.doc.token_index_end(self.end_idx)
                if new_end is not None:
                    self.end_token = new_end
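
Both properties follow the same fast-path/re-scan pattern: trust the cached token index while it still points at the remembered character offset, otherwise re-scan the Doc. A compact pure-Python sketch of that pattern, with all names hypothetical:

# Hypothetical, simplified version of the start-property logic above.
def resolve_start(start_chars, cached_token, cached_char):
    # fast path: the cached token index still points at the remembered offset
    if cached_token < len(start_chars) and start_chars[cached_token] == cached_char:
        return cached_token
    # slow path: a merge shifted the indices, so re-scan by character offset
    for i, idx in enumerate(start_chars):
        if idx == cached_char:
            return i
    return cached_token  # offset no longer present; keep the cached index

start_chars = [0, 4, 9, 12, 14]                # "New York is a city"
assert resolve_start(start_chars, 4, 14) == 4  # nothing merged: fast path
merged = [0, 9, 12, 14]                        # "New York" merged into one token
assert resolve_start(merged, 4, 14) == 3       # stale index 4 re-resolved to 3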