mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Add alignment mode error and fix Doc.char_span docs (#6820)
* Raise an error on an unrecognized alignment mode rather than defaulting to `strict` * Fix the `Doc.char_span` API doc alignment mode details
This commit is contained in:
parent
d5ef245bb1
commit
4096a79de7
|
@ -591,6 +591,7 @@ class Errors(object):
|
|||
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
||||
"can not be combined with adding a pretrained Tok2Vec layer.")
|
||||
E201 = ("Span index out of range.")
|
||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -197,6 +197,12 @@ def test_spans_by_character(doc):
|
|||
assert span1.end_char == span2.end_char
|
||||
assert span2.label_ == "GPE"
|
||||
|
||||
# unsupported alignment mode
|
||||
with pytest.raises(ValueError):
|
||||
span2 = doc.char_span(
|
||||
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||
)
|
||||
|
||||
|
||||
def test_span_to_array(doc):
|
||||
span = doc[1:-2]
|
||||
|
|
|
@ -379,8 +379,9 @@ cdef class Doc:
|
|||
label = self.vocab.strings.add(label)
|
||||
if not isinstance(kb_id, int):
|
||||
kb_id = self.vocab.strings.add(kb_id)
|
||||
if alignment_mode not in ("strict", "contract", "expand"):
|
||||
alignment_mode = "strict"
|
||||
alignment_modes = ("strict", "contract", "expand")
|
||||
if alignment_mode not in alignment_modes:
|
||||
raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
|
||||
cdef int start = token_by_char(self.c, self.length, start_idx)
|
||||
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
|
||||
return None
|
||||
|
|
|
@ -199,15 +199,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start_idx` | int | The index of the first character of the span. |
|
||||
| `end_idx` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `start_idx` | int | The index of the first character of the span. |
|
||||
| `end_idx` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||
| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
|
||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user