diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 3676b35af..d02f305f4 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -367,6 +367,14 @@ def test_spans_by_character(doc): span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" ) + # Span.char_span + alignment mode "contract" + span2 = doc[0:2].char_span( + span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract" + ) + assert span1.start_char == span2.start_char + assert span1.end_char == span2.end_char + assert span2.label_ == "GPE" + def test_span_to_array(doc): span = doc[1:-2] diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi index 9986a90e6..00226098a 100644 --- a/spacy/tokens/span.pyi +++ b/spacy/tokens/span.pyi @@ -98,6 +98,7 @@ class Span: label: Union[int, str] = ..., kb_id: Union[int, str] = ..., vector: Optional[Floats1d] = ..., + alignment_mode: str = ..., ) -> Span: ... @property def conjuncts(self) -> Tuple[Token]: ... diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 99a5f43bd..2912dd705 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -362,7 +362,7 @@ cdef class Span: result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) # ensure we get a scalar back (numpy does this automatically but cupy doesn't) return result.item() - + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. @@ -639,7 +639,7 @@ cdef class Span: else: return self.doc[root] - def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"): """Create a `Span` object from the slice `span.text[start : end]`. start (int): The index of the first character of the span. @@ -649,11 +649,16 @@ cdef class Span: kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + alignment_mode (str): How character indices are aligned to token + boundaries. Options: "strict" (character indices must be aligned + with token boundaries), "contract" (span of all tokens completely + within the character span), "expand" (span of all tokens at least + partially covered by the character span). Defaults to "strict". RETURNS (Span): The newly constructed object. """ start_idx += self.c.start_char end_idx += self.c.start_char - return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) + return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode) @property def conjuncts(self): diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index bd7794edc..a135f5ec9 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -186,14 +186,15 @@ the character indices don't map to a valid span. > assert span.text == "New York" > ``` -| Name | Description | -| ----------- | ----------------------------------------------------------------------------------------- | -| `start` | The index of the first character of the span. ~~int~~ | -| `end` | The index of the last character after the span. ~~int~~ | -| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | -| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | -| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | -| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | +| Name | Description | +| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `start` | The index of the first character of the span. ~~int~~ | +| `end` | The index of the last character after the span. ~~int~~ | +| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | +| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | +| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | +| `alignment_mode` 3.5.1 | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | +| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | ## Span.similarity {id="similarity",tag="method",model="vectors"}