diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 25fa421b7..107078df9 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -167,11 +167,30 @@ def test_spans_are_hashable(en_tokenizer):
def test_spans_by_character(doc):
span1 = doc[1:-2]
+
+ # default and specified alignment mode "strict"
span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE")
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"
+ span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE", alignment_mode="strict")
+ assert span1.start_char == span2.start_char
+ assert span1.end_char == span2.end_char
+ assert span2.label_ == "GPE"
+
+ # alignment mode "contract"
+ span2 = doc.char_span(span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract")
+ assert span1.start_char == span2.start_char
+ assert span1.end_char == span2.end_char
+ assert span2.label_ == "GPE"
+
+ # alignment mode "expand"
+ span2 = doc.char_span(span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="expand")
+ assert span1.start_char == span2.start_char
+ assert span1.end_char == span2.end_char
+ assert span2.label_ == "GPE"
+
def test_span_to_array(doc):
span = doc[1:-2]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 5b03dc5d2..89573ba09 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -352,17 +352,25 @@ cdef class Doc:
def doc(self):
return self
- def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
- """Create a `Span` object from the slice `doc.text[start : end]`.
+ def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, alignment_mode="strict"):
+ """Create a `Span` object from the slice
+ `doc.text[start_idx : end_idx]`. Returns None if no valid `Span` can be
+ created.
doc (Doc): The parent document.
- start (int): The index of the first character of the span.
- end (int): The index of the first character after the span.
+ start_idx (int): The index of the first character of the span.
+ end_idx (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
- kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+ kb_id (uint64 or string): An ID from a KB to capture the meaning of a
+ named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
+ alignment_mode (str): How character indices are aligned to token
+ boundaries. Options: "strict" (character indices must be aligned
+ with token boundaries), "contract" (span of all tokens completely
+ within the character span), "expand" (span of all tokens at least
+ partially covered by the character span). Defaults to "strict".
RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span
@@ -371,12 +379,29 @@ cdef class Doc:
label = self.vocab.strings.add(label)
if not isinstance(kb_id, int):
kb_id = self.vocab.strings.add(kb_id)
- cdef int start = token_by_start(self.c, self.length, start_idx)
- if start == -1:
+ if alignment_mode not in ("strict", "contract", "expand"):
+ alignment_mode = "strict"
+ cdef int start = token_by_char(self.c, self.length, start_idx)
+ if start < 0 or (alignment_mode == "strict" and start_idx != self[start].idx):
return None
- cdef int end = token_by_end(self.c, self.length, end_idx)
- if end == -1:
+ # end_idx is exclusive, so find the token at one char before
+ cdef int end = token_by_char(self.c, self.length, end_idx - 1)
+ if end < 0 or (alignment_mode == "strict" and end_idx != self[end].idx + len(self[end])):
return None
+ # Adjust start and end by alignment_mode
+ if alignment_mode == "contract":
+ if self[start].idx < start_idx:
+ start += 1
+ if end_idx < self[end].idx + len(self[end]):
+ end -= 1
+ # if no tokens are completely within the span, return None
+ if end < start:
+ return None
+ elif alignment_mode == "expand":
+ # Don't consider the trailing whitespace to be part of the previous
+ # token
+ if start_idx == self[start].idx + len(self[start]):
+ start += 1
# Currently we have the token index, we want the range-end index
end += 1
cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
@@ -1167,23 +1192,35 @@ cdef class Doc:
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
- cdef int i
- for i in range(length):
- if tokens[i].idx == start_char:
- return i
+ cdef int i = token_by_char(tokens, length, start_char)
+ if i >= 0 and tokens[i].idx == start_char:
+ return i
else:
return -1
cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2:
- cdef int i
- for i in range(length):
- if tokens[i].idx + tokens[i].lex.length == end_char:
- return i
+ # end_char is exclusive, so find the token at one char before
+ cdef int i = token_by_char(tokens, length, end_char - 1)
+ if i >= 0 and tokens[i].idx + tokens[i].lex.length == end_char:
+ return i
else:
return -1
+cdef int token_by_char(const TokenC* tokens, int length, int char_idx) except -2:
+ cdef int start = 0, mid, end = length - 1
+ while start <= end:
+ mid = (start + end) / 2
+ if char_idx < tokens[mid].idx:
+ end = mid - 1
+ elif char_idx >= tokens[mid].idx + tokens[mid].lex.length + tokens[mid].spacy:
+ start = mid + 1
+ else:
+ return mid
+ return -1
+
+
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
cdef TokenC* head
cdef TokenC* child
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7decc2278..420e12fcb 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -187,8 +187,9 @@ Remove a previously registered extension.
## Doc.char_span {#char_span tag="method" new="2"}
-Create a `Span` object from the slice `doc.text[start:end]`. Returns `None` if
-the character indices don't map to a valid span.
+Create a `Span` object from the slice `doc.text[start_idx:end_idx]`. Returns
+`None` if the character indices don't map to a valid span using the default mode
+`"strict"`.
> #### Example
>
@@ -198,14 +199,15 @@ the character indices don't map to a valid span.
> assert span.text == "New York"
> ```
-| Name | Type | Description |
-| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
-| `start` | int | The index of the first character of the span. |
-| `end` | int | The index of the last character after the span. |
-| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
-| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
-| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
-| **RETURNS** | `Span` | The newly constructed object or `None`. |
+| Name | Type | Description |
+| ------------------------------------ | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `start_idx` | int | The index of the first character of the span. |
+| `end_idx` | int | The index of the last character after the span. |
+| `label` | uint64 / unicode | A label to attach to the span, e.g. for named entities. |
+| `kb_id` 2.2 | uint64 / unicode | An ID from a knowledge base to capture the meaning of a named entity. |
+| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. |
+| `alignment_mode`                     | `str`                                    | How character indices are aligned to token boundaries. Options: "strict" (character indices must be aligned with token boundaries), "contract" (span of all tokens completely within the character span), "expand" (span of all tokens at least partially covered by the character span). Defaults to "strict". |
+| **RETURNS** | `Span` | The newly constructed object or `None`. |
## Doc.similarity {#similarity tag="method" model="vectors"}
@@ -646,26 +648,26 @@ The L2 norm of the document's vector representation.
## Attributes {#attributes}
-| Name | Type | Description |
-| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text` | unicode | A unicode representation of the document text. |
-| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
-| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
-| `vocab` | `Vocab` | The store of lexical types. |
-| `tensor` 2 | `ndarray` | Container for dense vector representations. |
-| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
-| `user_data` | - | A generic storage area, for user custom data. |
-| `lang` 2.1 | int | Language of the document's vocabulary. |
-| `lang_` 2.1 | unicode | Language of the document's vocabulary. |
-| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
-| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
-| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
-| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
-| `sentiment` | float | The document's positivity/negativity score, if available. |
-| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
-| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
-| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
-| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
+| Name | Type | Description |
+| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `text` | unicode | A unicode representation of the document text. |
+| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
+| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
+| `vocab` | `Vocab` | The store of lexical types. |
+| `tensor` 2 | `ndarray` | Container for dense vector representations. |
+| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
+| `user_data` | - | A generic storage area, for user custom data. |
+| `lang` 2.1 | int | Language of the document's vocabulary. |
+| `lang_` 2.1 | unicode | Language of the document's vocabulary. |
+| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
+| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
+| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
+| `is_nered` 2.1 | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
+| `sentiment` | float | The document's positivity/negativity score, if available. |
+| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
+| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
+| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
+| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
## Serialization fields {#serialization-fields}