Add alignment_mode argument to Span.char_span() (#12145)

* Add alignment_mode argument to Span.char_span()

* Update website

* Update spacy/tokens/span.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Add test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Simon Gurcke 2023-01-27 20:43:40 +10:00 committed by GitHub
parent c68e6b8a96
commit 774c10fa39
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 26 additions and 11 deletions

View File

@ -367,6 +367,14 @@ def test_spans_by_character(doc):
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
) )
# Span.char_span + alignment mode "contract"
span2 = doc[0:2].char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
def test_span_to_array(doc): def test_span_to_array(doc):
span = doc[1:-2] span = doc[1:-2]

View File

@ -98,6 +98,7 @@ class Span:
label: Union[int, str] = ..., label: Union[int, str] = ...,
kb_id: Union[int, str] = ..., kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ..., vector: Optional[Floats1d] = ...,
alignment_mode: str = ...,
) -> Span: ... ) -> Span: ...
@property @property
def conjuncts(self) -> Tuple[Token]: ... def conjuncts(self) -> Tuple[Token]: ...

View File

@ -362,7 +362,7 @@ cdef class Span:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't) # ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item() return result.item()
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy """Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document. `ndarray` of shape `(N, M)`, where `N` is the length of the document.
@ -639,7 +639,7 @@ cdef class Span:
else: else:
return self.doc[root] return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"):
"""Create a `Span` object from the slice `span.text[start : end]`. """Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span. start (int): The index of the first character of the span.
@ -649,11 +649,16 @@ cdef class Span:
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span. the span.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
RETURNS (Span): The newly constructed object. RETURNS (Span): The newly constructed object.
""" """
start_idx += self.c.start_char start_idx += self.c.start_char
end_idx += self.c.start_char end_idx += self.c.start_char
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode)
@property @property
def conjuncts(self): def conjuncts(self):

View File

@ -186,14 +186,15 @@ the character indices don't map to a valid span.
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------------------------------- | | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ | | `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ | | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Span.similarity {id="similarity",tag="method",model="vectors"} ## Span.similarity {id="similarity",tag="method",model="vectors"}