mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Add alignment mode error and fix Doc.char_span docs (#6820)
* Raise an error on an unrecognized alignment mode rather than defaulting to `strict`
* Fix the `Doc.char_span` API doc alignment mode details
This commit is contained in:
parent
d5ef245bb1
commit
4096a79de7
|
@ -591,6 +591,7 @@ class Errors(object):
|
||||||
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
||||||
"can not be combined with adding a pretrained Tok2Vec layer.")
|
"can not be combined with adding a pretrained Tok2Vec layer.")
|
||||||
E201 = ("Span index out of range.")
|
E201 = ("Span index out of range.")
|
||||||
|
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -197,6 +197,12 @@ def test_spans_by_character(doc):
|
||||||
assert span1.end_char == span2.end_char
|
assert span1.end_char == span2.end_char
|
||||||
assert span2.label_ == "GPE"
|
assert span2.label_ == "GPE"
|
||||||
|
|
||||||
|
# unsupported alignment mode
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
span2 = doc.char_span(
|
||||||
|
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_span_to_array(doc):
|
def test_span_to_array(doc):
|
||||||
span = doc[1:-2]
|
span = doc[1:-2]
|
||||||
|
|
|
@ -379,8 +379,9 @@ cdef class Doc:
|
||||||
label = self.vocab.strings.add(label)
|
label = self.vocab.strings.add(label)
|
||||||
if not isinstance(kb_id, int):
|
if not isinstance(kb_id, int):
|
||||||
kb_id = self.vocab.strings.add(kb_id)
|
kb_id = self.vocab.strings.add(kb_id)
|
||||||
if alignment_mode not in ("strict", "contract", "expand"):
|
alignment_modes = ("strict", "contract", "expand")
|
||||||
alignment_mode = "strict"
|
if alignment_mode not in alignment_modes:
|
||||||
|
raise ValueError(Errors.E202.format(mode=alignment_mode, modes=", ".join(alignment_modes)))
|
||||||
cdef int start = token_by_char(self.c, self.length, start_idx)
|
cdef int start = token_by_char(self.c, self.length, start_idx)
|
||||||
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
|
if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
|
||||||
return None
|
return None
|
||||||
|
|
|
@ -199,15 +199,15 @@ Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
|
||||||
> assert span.text == "New York"
|
> assert span.text == "New York"
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------ | ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `start_idx` | int | The index of the first character of the span. |
|
| `start_idx` | int | The index of the first character of the span. |
|
||||||
| `end_idx` | int | The index of the first character after the span. |
|
| `end_idx` | int | The index of the first character after the span. |
|
||||||
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
|
||||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
|
||||||
| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "inside" (span of all tokens completely within the character span), "outside" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
|
| `alignment_mode` | `str` | How character indices snap to token boundaries. Options: "strict" (no snapping), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
|
||||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||||
|
|
||||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user