mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 09:26:27 +03:00
* span start, end -> properties. autoupdate after merge
This commit is contained in:
parent
562db6d2d0
commit
4be7fda453
|
@ -4,8 +4,10 @@ from .doc cimport Doc
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
cdef readonly Doc doc
|
cdef readonly Doc doc
|
||||||
cdef public int i
|
cdef public int i
|
||||||
cdef public int start
|
cdef public int start_token
|
||||||
cdef public int end
|
cdef public int end_token
|
||||||
|
cdef public int start_idx
|
||||||
|
cdef public int end_idx
|
||||||
cdef readonly int label
|
cdef readonly int label
|
||||||
|
|
||||||
cdef public _vector
|
cdef public _vector
|
||||||
|
|
|
@ -21,8 +21,10 @@ cdef class Span:
|
||||||
raise IndexError
|
raise IndexError
|
||||||
|
|
||||||
self.doc = tokens
|
self.doc = tokens
|
||||||
self.start = start
|
self.start_token = start
|
||||||
self.end = end
|
self.start_idx = self.doc[start].idx
|
||||||
|
self.end_token = end
|
||||||
|
self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
|
||||||
self.label = label
|
self.label = label
|
||||||
self._vector = vector
|
self._vector = vector
|
||||||
self._vector_norm = vector_norm
|
self._vector_norm = vector_norm
|
||||||
|
@ -76,6 +78,46 @@ cdef class Span:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
||||||
|
property start:
    """Get the start token index of this span within its Doc.

    The position cached in `start_token` is re-validated on every access
    against `start_idx` (recorded at construction from the `idx` of the
    span's first token). If the two no longer agree -- e.g. after spans
    in the Doc were merged -- the position is looked up again and the
    cache is updated in place.

    Returns:
        The (possibly refreshed) value of `self.start_token`.

    Raises:
        IndexError: if `doc.token_index_start` returns None for
            `start_idx`.
    """
    def __get__(self):
        # Token currently at the cached position, provided that position
        # is still inside the Doc.
        first = None
        if self.start_token < len(self.doc):
            first = self.doc[self.start_token]
        # A merge in the Doc may have shifted token positions. The cache is
        # only trusted when the token at the cached position still carries
        # the `idx` recorded for the span's first token.
        if first is None or first.idx != self.start_idx:
            # Presumably scans the Doc for the token whose `idx` equals
            # `start_idx` and returns its position, or None -- the helper
            # is defined on Doc and not visible here.
            new_start = self.doc.token_index_start(self.start_idx)
            if new_start is None:
                # Trailing space keeps the two adjacent literals from
                # running together in the final message.
                raise IndexError('Something went terribly wrong during a merge. '
                                 'No token found with idx %s' % self.start_idx)
            self.start_token = new_start
        return self.start_token
|
||||||
|
|
||||||
|
property end:
    """Get the end token index (exclusive) of this span within its Doc.

    The position cached in `end_token` is re-validated on every access
    against `end_idx` (recorded at construction as the `idx` of the span's
    last token plus that token's length). If the two no longer agree --
    e.g. after spans in the Doc were merged -- the position is looked up
    again and the cache is updated in place.

    Returns:
        The (possibly refreshed) value of `self.end_token`.

    Raises:
        IndexError: if `doc.token_index_end` returns None for `end_idx`.
    """
    def __get__(self):
        # Last token of the span according to the cached position, provided
        # that position is still inside the Doc.
        # NOTE(review): when end_token is 0 this indexes doc[-1]; how Doc
        # treats negative indices is not visible here -- confirm.
        last = None
        if self.end_token <= len(self.doc):
            last = self.doc[self.end_token - 1]
        # A merge in the Doc may have shifted token positions. The cache is
        # only trusted when the last token still ends where it did at
        # construction, i.e. last.idx + len(last) equals end_idx.
        if last is None or last.idx + len(last) != self.end_idx:
            # Presumably scans the Doc for the token that ends at `end_idx`
            # and returns the matching end position, or None -- the helper
            # is defined on Doc and not visible here.
            new_end = self.doc.token_index_end(self.end_idx)
            if new_end is None:
                # Trailing space keeps the two adjacent literals from
                # running together in the final message.
                raise IndexError('Something went terribly wrong during a merge. '
                                 'No token found with idx %s' % self.end_idx)
            self.end_token = new_end
        return self.end_token
|
||||||
|
|
||||||
property vector:
|
property vector:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
if self._vector is None:
|
if self._vector is None:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user