Add alignment mode error and fix Doc.char_span docs (#6820)

* Raise an error on an unrecognized alignment mode rather than
defaulting to `strict`
* Fix the `Doc.char_span` API doc alignment mode details
This commit is contained in:
Adriane Boyd 2021-01-27 13:40:42 +01:00 committed by GitHub
parent d5ef245bb1
commit 4096a79de7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 19 additions and 11 deletions

View File

@ -591,6 +591,7 @@ class Errors(object):
E200 = ("Specifying a base model with a pretrained component '{component}' "
"can not be combined with adding a pretrained Tok2Vec layer.")
E201 = ("Span index out of range.")
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
@add_codes

View File

@ -197,6 +197,12 @@ def test_spans_by_character(doc):
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
# unsupported alignment mode
with pytest.raises(ValueError):
span2 = doc.char_span(
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
)
def test_span_to_array(doc):
span = doc[1:-2]

View File

@ -379,8 +379,9 @@ cdef class Doc:
label = self.vocab.strings.add(label)
if not isinstance(kb_id, int):
kb_id = self.vocab.strings.add(kb_id)
if alignment_mode not in ("strict", "contract", "expand"):
alignment_mode = "strict"
alignment_modes = ("strict", "contract", "expand")
if alignment_mode not in alignment_modes:
raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
cdef int start = token_by_char(self.c, self.length, start_idx)
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
return None

View File

@ -200,13 +200,13 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
> ```
| Name | Type | Description |
| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `start_idx` | int | The index of the first character of the span. |
| `end_idx` | int | The index of the first character after the span. |
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
| **RETURNS** | `Span` | The newly constructed object or `None`. |
## Doc.similarity {#similarity tag="method" model="vectors"}