From 3102e2e27a7a095cb695a21c1ef21e6efdce9f8a Mon Sep 17 00:00:00 2001
From: Raphael Mitsch <r.mitsch@outlook.com>
Date: Mon, 20 Mar 2023 12:25:18 +0100
Subject: [PATCH 1/4] Entity linking: use `SpanGroup` instead of
 `Iterable[Span]` for mentions (#12344)

* Convert Candidate from Cython to Python class.

* Format.

* Fix .entity_ typo in _add_activations() usage.

* Change type for mentions to look up entity candidates for to SpanGroup from Iterable[Span].

* Update docs.

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update doc string of BaseCandidate.__init__().

* Update spacy/kb/candidate.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate.

* Adjust Candidate to support and mandate numerical entity IDs.

* Format.

* Fix docstring and docs.

* Update website/docs/api/kb.mdx

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Rename alias -> mention.

* Refactor Candidate attribute names. Update docs and tests accordingly.

* Refacor Candidate attributes and their usage.

* Format.

* Fix mypy error.

* Update error code in line with v4 convention.

* Reverse erroneous changes during merge.

* Update return type in EL tests.

* Re-add Candidate to setup.py.

* Format updated docs.

---------

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/kb/__init__.py                       |  1 -
 spacy/kb/kb.pyx                            |  6 +++---
 spacy/ml/models/entity_linker.py           |  8 ++++----
 spacy/pipeline/entity_linker.py            | 13 ++++++++-----
 spacy/tests/pipeline/test_entity_linker.py |  1 -
 website/docs/api/inmemorylookupkb.mdx      |  5 +++--
 website/docs/api/kb.mdx                    | 11 +++++------
 7 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py
index ff0e209e3..c8a657d62 100644
--- a/spacy/kb/__init__.py
+++ b/spacy/kb/__init__.py
@@ -2,5 +2,4 @@ from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate
 
-
 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx
index 1cb08f488..2d0e1d5a1 100644
--- a/spacy/kb/kb.pyx
+++ b/spacy/kb/kb.pyx
@@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union
 from cymem.cymem cimport Pool
 
 from .candidate import Candidate
-from ..tokens import Span
+from ..tokens import Span, SpanGroup
 from ..util import SimpleFrozenList
 from ..errors import Errors
 
@@ -30,13 +30,13 @@ cdef class KnowledgeBase:
         self.entity_vector_length = entity_vector_length
         self.mem = Pool()
 
-    def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]:
+    def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]:
         """
         Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the
         entity's embedding vector. Depending on the KB implementation, further properties - such as the prior
         probability of the specified mention text resolving to that entity - might be included.
         If no candidates are found for a given mention, an empty list is returned.
-        mentions (Iterable[Span]): Mentions for which to get candidates.
+        mentions (SpanGroup): Mentions for which to get candidates.
         RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
         """
         return [self.get_candidates(span) for span in mentions]
diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 7fe0b4741..b5122b164 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -8,7 +8,7 @@ from ...util import registry
 from ...kb import KnowledgeBase, InMemoryLookupKB
 from ...kb import Candidate
 from ...vocab import Vocab
-from ...tokens import Span, Doc
+from ...tokens import Doc, Span, SpanGroup
 from ..extract_spans import extract_spans
 from ...errors import Errors
 
@@ -114,7 +114,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
 
 @registry.misc("spacy.CandidateBatchGenerator.v1")
 def create_candidates_batch() -> Callable[
-    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+    [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
 ]:
     return get_candidates_batch
 
@@ -130,12 +130,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]:
 
 
 def get_candidates_batch(
-    kb: KnowledgeBase, mentions: Iterable[Span]
+    kb: KnowledgeBase, mentions: SpanGroup
 ) -> Iterable[Iterable[Candidate]]:
     """
     Return candidate entities for the given mentions and fetching appropriate entries from the index.
     kb (KnowledgeBase): Knowledge base to query.
-    mentions (Iterable[Span]): Entity mentions for which to identify candidates.
+    mentions (SpanGroup): Entity mentions for which to identify candidates.
     RETURNS (Iterable[Iterable[Candidate]]): Identified candidates.
     """
     return kb.get_candidates_batch(mentions)
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index caced9cfd..ecd156db5 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -11,6 +11,8 @@ from thinc.api import set_dropout_rate
 
 from ..kb import KnowledgeBase, Candidate
 from ..tokens import Doc, Span
+from ..ml import empty_kb
+from ..tokens import Doc, Span, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
 from ..language import Language
@@ -82,7 +84,7 @@ def make_entity_linker(
     entity_vector_length: int,
     get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
     get_candidates_batch: Callable[
-        [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+        [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
     ],
     generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
     overwrite: bool,
@@ -105,7 +107,7 @@ def make_entity_linker(
     get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
         produces a list of candidates, given a certain knowledge base and a textual mention.
     get_candidates_batch (
-        Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
+        Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]]
         ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
     generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
     scorer (Optional[Callable]): The scoring method.
@@ -170,7 +172,7 @@ class EntityLinker(TrainablePipe):
         entity_vector_length: int,
         get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
         get_candidates_batch: Callable[
-            [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
+            [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]
         ],
         generate_empty_kb: Callable[[Vocab, int], KnowledgeBase],
         overwrite: bool = False,
@@ -194,7 +196,7 @@ class EntityLinker(TrainablePipe):
         get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
             produces a list of candidates, given a certain knowledge base and a textual mention.
         get_candidates_batch (
-            Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]],
+            Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]],
             Iterable[Candidate]]
             ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions.
         generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase.
@@ -473,7 +475,8 @@ class EntityLinker(TrainablePipe):
 
                 batch_candidates = list(
                     self.get_candidates_batch(
-                        self.kb, [ent_batch[idx] for idx in valid_ent_idx]
+                        self.kb,
+                        SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]),
                     )
                     if self.candidates_batch_size > 1
                     else [
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 65406a36e..773a5b8f3 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -997,7 +997,6 @@ def test_scorer_links():
 )
 # fmt: on
 def test_legacy_architectures(name, config):
-
     # Ensure that the legacy architectures still work
     vector_length = 3
     nlp = English()
diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx
index 6fa6cb235..3b33f7fb7 100644
--- a/website/docs/api/inmemorylookupkb.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -189,14 +189,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates_batch([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                                  |
 | ----------- | ------------------------------------------------------------------------------------------------------------ |
-| `mentions`  | The textual mentions. ~~Iterable[Span]~~                                                                     |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                                          |
 | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ |
 
 ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 9536a3fe3..94506162f 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -93,14 +93,15 @@ to you.
 >
 > ```python
 > from spacy.lang.en import English
+> from spacy.tokens import SpanGroup
 > nlp = English()
 > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.")
-> candidates = kb.get_candidates((doc[0:2], doc[3:]))
+> candidates = kb.get_candidates([SpanGroup(doc, spans=[doc[0:2], doc[3:]]])
 > ```
 
 | Name        | Description                                                                                  |
 | ----------- | -------------------------------------------------------------------------------------------- |
-| `mentions`  | The textual mention or alias. ~~Iterable[Span]~~                                             |
+| `mentions`  | The textual mentions. ~~SpanGroup~~                                                          |
 | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ |
 
 ## KnowledgeBase.get_vector {id="get_vector",tag="method"}
@@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called
 directly, but instead these objects are returned by the `get_candidates` method
 of the [`entity_linker`](/api/entitylinker) pipe.
 
-> #### Example```python
+> #### Example
 >
+> ```python
 > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb,
 > entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
->
-> ```
->
 > ```
 
 | Name          | Description                                                               |

From a653dec6541c1a18ef820dc11ff7fb2a287c8665 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 27 Mar 2023 09:18:23 +0200
Subject: [PATCH 2/4] Add info that Vocab and StringStore are not static in
 docs (#12427)

* Add size increase info about vocab and stringstore

* Update website/docs/api/stringstore.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Update website/docs/api/vocab.mdx

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>

* Change wording

---------

Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com>
---
 website/docs/api/stringstore.mdx | 7 +++++++
 website/docs/api/vocab.mdx       | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx
index 7e380f5f8..2425c8adc 100644
--- a/website/docs/api/stringstore.mdx
+++ b/website/docs/api/stringstore.mdx
@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.
 
+<Infobox variant="warning">
+
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+
+</Infobox>
+
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}
 
 Create the `StringStore`.
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 304040f9c..1e32eb118 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.
 
+<Infobox variant="warning">
+
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+
+</Infobox>
+
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}
 
 Create the vocabulary.

From b734e5314d3b8faa7d463c265db9a823a113165c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= <me@danieldk.eu>
Date: Thu, 30 Mar 2023 09:30:42 +0200
Subject: [PATCH 3/4] Avoid `TrainablePipe.finish_update` getting called twice
 during training (#12450)

* Avoid `TrainablePipe.finish_update` getting called twice during training

PR #12136 fixed an issue where the tok2vec pipe was updated before
gradient were accumulated. However, it introduced a new bug that cause
`finish_update` to be called twice when using the training loop. This
causes a fairly large slowdown.

The `Language.update` method accepts the `sgd` argument for passing an
optimizer. This argument has three possible values:

- `Optimizer`: use the given optimizer to finish pipe updates.
- `None`: use a default optimizer to finish pipe updates.
- `False`: do not finish pipe updates.

However, the latter option was not documented and not valid with the
existing type of `sgd`. I assumed that this was a remnant of earlier
spaCy versions and removed handling of `False`.

However, with that change, we are passing `None` to `Language.update`.
As a result, we were calling `finish_update` in both `Language.update`
and in the training loop after all subbatches are processed.

This change restores proper handling/use of `False`. Moreover, the role
of `False` is now documented and added to the type to avoid future
accidents.

* Fix typo

* Document defaults for `Language.update`
---
 spacy/language.py             |  7 +++++--
 spacy/tests/test_language.py  | 18 ++++++++++++++++++
 spacy/training/loop.py        |  2 +-
 website/docs/api/language.mdx | 18 +++++++++---------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 3b86fdde7..ce3630629 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1202,7 +1202,7 @@ class Language:
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1213,7 +1213,9 @@ class Language:
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1272,6 +1274,7 @@ class Language:
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
 
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 9b8c7b9c7..08a7d28a4 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -157,6 +157,24 @@ def test_language_update_updates():
     )
 
 
+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index c737d7c01..587a2516c 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -210,7 +210,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx
index c25bfcee5..5cd9e4af8 100644
--- a/website/docs/api/language.mdx
+++ b/website/docs/api/language.mdx
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 >     nlp.update([example], sgd=optimizer)
 > ```
 
-| Name            | Description                                                                                                                                    |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
-| _keyword-only_  |                                                                                                                                                |
-| `drop`          | The dropout rate. ~~float~~                                                                                                                    |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                  |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                          |
+| Name            | Description                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                                                                                  |
+| _keyword-only_  |                                                                                                                                                                                                    |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~                                                                                                                                                     |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~                                                                                |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |
 
 ## Language.distill {id="distill",tag="method,experimental",version="4"}
 

From 5d0f48fe69223a8bb95a81734ef61a6d3c2aefa0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 Apr 2023 16:01:59 +0200
Subject: [PATCH 4/4] Enforce that Span.start/end(_char) remain valid and in
 sync (#12268)

* Enforce that Span.start/end(_char) remain valid and in sync

Allowing span attributes to be writable starting in v3 has made it
possible for the internal `Span.start/end/start_char/end_char` to get
out-of-sync or have invalid values.

This checks that the values are valid and syncs the token and char
offsets if any attributes are modified directly. It does not yet handle
the case where the underlying doc is modified.

* Format
---
 spacy/errors.py              |  5 +++-
 spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++
 spacy/tokens/span.pyx        | 49 +++++++++++++++++++++++++++---------
 3 files changed, 88 insertions(+), 13 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index e1f7e7400..5c9f41b7e 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -926,7 +926,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -966,6 +966,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")
 
 
 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index a99f8b561..8eabcd645 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -707,3 +707,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 75f7db7ca..3d64a24a5 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -772,36 +772,61 @@ cdef class Span:
             return self.span_c().start
 
         def __set__(self, int start):
-            if start < 0:
-                raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx
 
     property end:
         def __get__(self):
             return self.span_c().end
 
         def __set__(self, int end):
-            if end < 0:
-                raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0
 
     property start_char:
         def __get__(self):
             return self.span_c().start_char
 
         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start
 
     property end_char:
         def __get__(self):
             return self.span_c().end_char
 
         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end
 
     property label:
         def __get__(self):