From 02f2af3ad8f7d24ce11d3dfc05b817ba41c93013 Mon Sep 17 00:00:00 2001 From: Simon Gurcke Date: Mon, 23 Jan 2023 11:25:51 +1000 Subject: [PATCH] Add alignment_mode argument to Span.char_span() --- spacy/tokens/span.pyi | 1 + spacy/tokens/span.pyx | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 9986a90e6..00226098a 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -98,6 +98,7 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + alignment_mode: str = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 99a5f43bd..6e78b9b13 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -362,7 +362,7 @@ cdef class Span: result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) # ensure we get a scalar back (numpy does this automatically but cupy doesn't) return result.item() - + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -639,7 +639,7 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict", id=0): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. @@ -649,11 +649,16 @@ cdef class Span: kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + alignment_mode (str): How character indices are aligned to token + boundaries. Options: "strict" (character indices must be aligned + with token boundaries), "contract" (span of all tokens completely + within the character span), "expand" (span of all tokens at least + partially covered by the character span). Defaults to "strict". RETURNS (Span): The newly constructed object. """ start_idx += self.c.start_char end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode) @property def conjuncts(self):