From 3102e2e27a7a095cb695a21c1ef21e6efdce9f8a Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Mon, 20 Mar 2023 12:25:18 +0100 Subject: [PATCH 1/4] Entity linking: use `SpanGroup` instead of `Iterable[Span]` for mentions (#12344) * Convert Candidate from Cython to Python class. * Format. * Fix .entity_ typo in _add_activations() usage. * Change type of the mentions to look up entity candidates for from Iterable[Span] to SpanGroup. * Update docs. * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Update doc string of BaseCandidate.__init__(). * Update spacy/kb/candidate.py Co-authored-by: Sofie Van Landeghem * Rename Candidate to InMemoryCandidate, BaseCandidate to Candidate. * Adjust Candidate to support and mandate numerical entity IDs. * Format. * Fix docstring and docs. * Update website/docs/api/kb.mdx Co-authored-by: Sofie Van Landeghem * Rename alias -> mention. * Refactor Candidate attribute names. Update docs and tests accordingly. * Refactor Candidate attributes and their usage. * Format. * Fix mypy error. * Update error code in line with v4 convention. * Reverse erroneous changes during merge. * Update return type in EL tests. * Re-add Candidate to setup.py. * Format updated docs. 
--------- Co-authored-by: Sofie Van Landeghem --- spacy/kb/__init__.py | 1 - spacy/kb/kb.pyx | 6 +++--- spacy/ml/models/entity_linker.py | 8 ++++---- spacy/pipeline/entity_linker.py | 13 ++++++++----- spacy/tests/pipeline/test_entity_linker.py | 1 - website/docs/api/inmemorylookupkb.mdx | 5 +++-- website/docs/api/kb.mdx | 11 +++++------ 7 files changed, 23 insertions(+), 22 deletions(-) diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index ff0e209e3..c8a657d62 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -2,5 +2,4 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB from .candidate import Candidate, InMemoryCandidate - __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 1cb08f488..2d0e1d5a1 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -5,7 +5,7 @@ from typing import Iterable, Tuple, Union from cymem.cymem cimport Pool from .candidate import Candidate -from ..tokens import Span +from ..tokens import Span, SpanGroup from ..util import SimpleFrozenList from ..errors import Errors @@ -30,13 +30,13 @@ cdef class KnowledgeBase: self.entity_vector_length = entity_vector_length self.mem = Pool() - def get_candidates_batch(self, mentions: Iterable[Span]) -> Iterable[Iterable[Candidate]]: + def get_candidates_batch(self, mentions: SpanGroup) -> Iterable[Iterable[Candidate]]: """ Return candidate entities for a specified Span mention. Each candidate defines at least the entity and the entity's embedding vector. Depending on the KB implementation, further properties - such as the prior probability of the specified mention text resolving to that entity - might be included. If no candidates are found for a given mention, an empty list is returned. - mentions (Iterable[Span]): Mentions for which to get candidates. + mentions (SpanGroup): Mentions for which to get candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. 
""" return [self.get_candidates(span) for span in mentions] diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 7fe0b4741..b5122b164 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -8,7 +8,7 @@ from ...util import registry from ...kb import KnowledgeBase, InMemoryLookupKB from ...kb import Candidate from ...vocab import Vocab -from ...tokens import Span, Doc +from ...tokens import Doc, Span, SpanGroup from ..extract_spans import extract_spans from ...errors import Errors @@ -114,7 +114,7 @@ def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]: @registry.misc("spacy.CandidateBatchGenerator.v1") def create_candidates_batch() -> Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] ]: return get_candidates_batch @@ -130,12 +130,12 @@ def get_candidates(kb: KnowledgeBase, mention: Span) -> Iterable[Candidate]: def get_candidates_batch( - kb: KnowledgeBase, mentions: Iterable[Span] + kb: KnowledgeBase, mentions: SpanGroup ) -> Iterable[Iterable[Candidate]]: """ Return candidate entities for the given mentions and fetching appropriate entries from the index. kb (KnowledgeBase): Knowledge base to query. - mentions (Iterable[Span]): Entity mentions for which to identify candidates. + mentions (SpanGroup): Entity mentions for which to identify candidates. RETURNS (Iterable[Iterable[Candidate]]): Identified candidates. 
""" return kb.get_candidates_batch(mentions) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index caced9cfd..ecd156db5 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -11,6 +11,8 @@ from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate from ..tokens import Doc, Span +from ..ml import empty_kb +from ..tokens import Doc, Span, SpanGroup from .pipe import deserialize_config from .trainable_pipe import TrainablePipe from ..language import Language @@ -82,7 +84,7 @@ def make_entity_linker( entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, @@ -105,7 +107,7 @@ def make_entity_linker( get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. get_candidates_batch ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] + Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. 
@@ -170,7 +172,7 @@ class EntityLinker(TrainablePipe): entity_vector_length: int, get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]], get_candidates_batch: Callable[ - [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] + [KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]] ], generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = False, @@ -194,7 +196,7 @@ class EntityLinker(TrainablePipe): get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that produces a list of candidates, given a certain knowledge base and a textual mention. get_candidates_batch ( - Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], + Callable[[KnowledgeBase, SpanGroup], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. @@ -473,7 +475,8 @@ class EntityLinker(TrainablePipe): batch_candidates = list( self.get_candidates_batch( - self.kb, [ent_batch[idx] for idx in valid_ent_idx] + self.kb, + SpanGroup(doc, spans=[ent_batch[idx] for idx in valid_ent_idx]), ) if self.candidates_batch_size > 1 else [ diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 65406a36e..773a5b8f3 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -997,7 +997,6 @@ def test_scorer_links(): ) # fmt: on def test_legacy_architectures(name, config): - # Ensure that the legacy architectures still work vector_length = 3 nlp = English() diff --git a/website/docs/api/inmemorylookupkb.mdx b/website/docs/api/inmemorylookupkb.mdx index 6fa6cb235..3b33f7fb7 100644 --- a/website/docs/api/inmemorylookupkb.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -189,14 +189,15 @@ to you. 
> > ```python > from spacy.lang.en import English +> from spacy.tokens import SpanGroup > nlp = English() > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.") -> candidates = kb.get_candidates((doc[0:2], doc[3:])) +> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]])) > ``` | Name | Description | | ----------- | ------------------------------------------------------------------------------------------------------------ | -| `mentions` | The textual mentions. ~~Iterable[Span]~~ | +| `mentions` | The textual mentions. ~~SpanGroup~~ | | **RETURNS** | An iterable of iterable with relevant `InMemoryCandidate` objects. ~~Iterable[Iterable[InMemoryCandidate]]~~ | ## InMemoryLookupKB.get_vector {id="get_vector",tag="method"} diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 9536a3fe3..94506162f 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -93,14 +93,15 @@ to you. > > ```python > from spacy.lang.en import English +> from spacy.tokens import SpanGroup > nlp = English() > doc = nlp("Douglas Adams wrote 'The Hitchhiker's Guide to the Galaxy'.") -> candidates = kb.get_candidates((doc[0:2], doc[3:])) +> candidates = kb.get_candidates_batch(SpanGroup(doc, spans=[doc[0:2], doc[3:]])) > ``` | Name | Description | | ----------- | -------------------------------------------------------------------------------------------- | -| `mentions` | The textual mention or alias. ~~Iterable[Span]~~ | +| `mentions` | The textual mentions. ~~SpanGroup~~ | | **RETURNS** | An iterable of iterable with relevant `Candidate` objects. ~~Iterable[Iterable[Candidate]]~~ | ## KnowledgeBase.get_vector {id="get_vector",tag="method"} @@ -187,13 +188,11 @@ Construct an `InMemoryCandidate` object. Usually this constructor is not called directly, but instead these objects are returned by the `get_candidates` method of the [`entity_linker`](/api/entitylinker) pipe. 
-> #### Example```python +> #### Example > +> ```python > from spacy.kb import InMemoryCandidate candidate = InMemoryCandidate(kb, > entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) -> -> ``` -> > ``` | Name | Description | From a653dec6541c1a18ef820dc11ff7fb2a287c8665 Mon Sep 17 00:00:00 2001 From: Edward <43848523+thomashacker@users.noreply.github.com> Date: Mon, 27 Mar 2023 09:18:23 +0200 Subject: [PATCH 2/4] Add info that Vocab and StringStore are not static in docs (#12427) * Add size increase info about vocab and stringstore * Update website/docs/api/stringstore.mdx Co-authored-by: Raphael Mitsch * Update website/docs/api/vocab.mdx Co-authored-by: Raphael Mitsch * Change wording --------- Co-authored-by: Raphael Mitsch --- website/docs/api/stringstore.mdx | 7 +++++++ website/docs/api/vocab.mdx | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 7e380f5f8..2425c8adc 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of integer IDs. This ensures that strings always map to the same ID, even from different `StringStores`. + + +Note that a `StringStore` instance is not static. It increases in size as texts +with new tokens are processed. + + + ## StringStore.\_\_init\_\_ {id="init",tag="method"} Create the `StringStore`. diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 304040f9c..1e32eb118 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared between `Doc` objects. + + +Note that a `Vocab` instance is not static. It increases in size as texts with +new tokens are processed. 
+ + + ## Vocab.\_\_init\_\_ {id="init",tag="method"} Create the vocabulary. From b734e5314d3b8faa7d463c265db9a823a113165c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 30 Mar 2023 09:30:42 +0200 Subject: [PATCH 3/4] Avoid `TrainablePipe.finish_update` getting called twice during training (#12450) * Avoid `TrainablePipe.finish_update` getting called twice during training PR #12136 fixed an issue where the tok2vec pipe was updated before gradients were accumulated. However, it introduced a new bug that causes `finish_update` to be called twice when using the training loop. This causes a fairly large slowdown. The `Language.update` method accepts the `sgd` argument for passing an optimizer. This argument has three possible values: - `Optimizer`: use the given optimizer to finish pipe updates. - `None`: use a default optimizer to finish pipe updates. - `False`: do not finish pipe updates. However, the last option was not documented and not valid with the existing type of `sgd`. I assumed that this was a remnant of earlier spaCy versions and removed handling of `False`. However, with that change, we are passing `None` to `Language.update`. As a result, we were calling `finish_update` in both `Language.update` and in the training loop after all subbatches are processed. This change restores proper handling/use of `False`. Moreover, the role of `False` is now documented and added to the type to avoid future accidents. 
* Fix typo * Document defaults for `Language.update` --- spacy/language.py | 7 +++++-- spacy/tests/test_language.py | 18 ++++++++++++++++++ spacy/training/loop.py | 2 +- website/docs/api/language.mdx | 18 +++++++++--------- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 3b86fdde7..ce3630629 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1202,7 +1202,7 @@ class Language: _: Optional[Any] = None, *, drop: float = 0.0, - sgd: Optional[Optimizer] = None, + sgd: Union[Optimizer, None, Literal[False]] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), @@ -1213,7 +1213,9 @@ class Language: examples (Iterable[Example]): A batch of examples _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. - sgd (Optimizer): An optimizer. + sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will + be created via create_optimizer if 'None'. No optimizer will + be used when set to 'False'. losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. 
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline @@ -1272,6 +1274,7 @@ class Language: name not in exclude and isinstance(proc, ty.TrainableComponent) and proc.is_trainable + and sgd not in (None, False) ): proc.finish_update(sgd) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 9b8c7b9c7..08a7d28a4 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -157,6 +157,24 @@ def test_language_update_updates(): ) +def test_language_update_does_not_update_with_sgd_false(): + config = Config().from_str(TAGGER_CFG_STRING) + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + train_examples = [] + for t in TAGGER_TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + + docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + nlp.update(train_examples, sgd=False) + docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + + xp = get_array_module(docs_after_update[0].tensor) + xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor) + + def test_language_evaluate(nlp): text = "hello world" annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} diff --git a/spacy/training/loop.py b/spacy/training/loop.py index c737d7c01..587a2516c 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -210,7 +210,7 @@ def train_while_improving( subbatch, drop=dropout, losses=losses, - sgd=None, + sgd=False, exclude=exclude, annotates=annotating_components, ) diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index c25bfcee5..5cd9e4af8 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -323,15 +323,15 @@ and custom registered functions if needed. 
See the > nlp.update([example], sgd=optimizer) > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ | -| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. Defaults to `0.0`. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ | +| `losses` | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| **RETURNS** | The updated `losses` dictionary. 
~~Dict[str, float]~~ | ## Language.distill {id="distill",tag="method,experimental",version="4"} From 5d0f48fe69223a8bb95a81734ef61a6d3c2aefa0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 6 Apr 2023 16:01:59 +0200 Subject: [PATCH 4/4] Enforce that Span.start/end(_char) remain valid and in sync (#12268) * Enforce that Span.start/end(_char) remain valid and in sync Allowing span attributes to be writable starting in v3 has made it possible for the internal `Span.start/end/start_char/end_char` to get out-of-sync or have invalid values. This checks that the values are valid and syncs the token and char offsets if any attributes are modified directly. It does not yet handle the case where the underlying doc is modified. * Format --- spacy/errors.py | 5 +++- spacy/tests/doc/test_span.py | 47 ++++++++++++++++++++++++++++++++++ spacy/tokens/span.pyx | 49 +++++++++++++++++++++++++++--------- 3 files changed, 88 insertions(+), 13 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index e1f7e7400..5c9f41b7e 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -926,7 +926,7 @@ class Errors(metaclass=ErrorsWithCodes): E1029 = ("Edit tree cannot be applied to form.") E1030 = ("Edit tree identifier out of range.") E1031 = ("Could not find gold transition - see logs above.") - E1032 = ("`{var}` should not be {forbidden}, but received {value}.") + E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.") E1033 = ("Dimension {name} invalid -- only nO, nF, nP") E1034 = ("Node index {i} out of bounds ({length})") E1035 = ("Token index {i} out of bounds ({length})") @@ -966,6 +966,9 @@ class Errors(metaclass=ErrorsWithCodes): E4004 = ("Backprop is not supported when is_train is not set.") E4005 = ("EntityLinker_v1 is not supported in spaCy v4. 
Update your configuration.") E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") + E4007 = ("Span {var} {value} must be {op} Span {existing_var} " + "{existing_value}.") + E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index a99f8b561..8eabcd645 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -707,3 +707,50 @@ def test_span_ent_id(en_tokenizer): doc.ents = [span] assert doc.ents[0].ent_id_ == "ID2" assert doc[1].ent_id_ == "ID2" + + +def test_span_start_end_sync(en_tokenizer): + doc = en_tokenizer("a bc def e fghij kl") + # can create and edit span starts/ends + span = doc[2:4] + span.start_char = 2 + span.end = 5 + assert span == doc[span.start : span.end] + assert span == doc.char_span(span.start_char, span.end_char) + # cannot set completely out of bounds starts/ends + with pytest.raises(IndexError): + span.start = -1 + with pytest.raises(IndexError): + span.end = -1 + with pytest.raises(IndexError): + span.start_char = len(doc.text) + 1 + with pytest.raises(IndexError): + span.end = len(doc.text) + 1 + # test all possible char starts/ends + span = doc[0 : len(doc)] + token_char_starts = [token.idx for token in doc] + token_char_ends = [token.idx + len(token.text) for token in doc] + for i in range(len(doc.text)): + if i not in token_char_starts: + with pytest.raises(ValueError): + span.start_char = i + else: + span.start_char = i + span = doc[0 : len(doc)] + for i in range(len(doc.text)): + if i not in token_char_ends: + with pytest.raises(ValueError): + span.end_char = i + else: + span.end_char = i + # start must be <= end + span = doc[1:3] + with pytest.raises(ValueError): + span.start = 4 + with pytest.raises(ValueError): + span.end = 0 + span = doc.char_span(2, 8) + with pytest.raises(ValueError): + span.start_char = 9 + with 
pytest.raises(ValueError): + span.end_char = 1 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 75f7db7ca..3d64a24a5 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -772,36 +772,61 @@ cdef class Span: return self.span_c().start def __set__(self, int start): - if start < 0: - raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) - self.span_c().start = start + if start < 0 or start > self.doc.length: + raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start)) + cdef SpanC* span_c = self.span_c() + if start > span_c.end: + raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end)) + span_c.start = start + span_c.start_char = self.doc.c[start].idx property end: def __get__(self): return self.span_c().end def __set__(self, int end): - if end < 0: - raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) - self.span_c().end = end + if end < 0 or end > self.doc.length: + raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end)) + cdef SpanC* span_c = self.span_c() + if span_c.start > end: + raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start)) + span_c.end = end + if end > 0: + span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length + else: + span_c.end_char = 0 property start_char: def __get__(self): return self.span_c().start_char def __set__(self, int start_char): - if start_char < 0: - raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) - self.span_c().start_char = start_char + if start_char < 0 or start_char > len(self.doc.text): + raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char)) + cdef int start = token_by_start(self.doc.c, self.doc.length, start_char) + if start 
< 0: + raise ValueError(Errors.E4008.format(value=start_char, pos="start")) + cdef SpanC* span_c = self.span_c() + if start_char > span_c.end_char: + raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char)) + span_c.start_char = start_char + span_c.start = start property end_char: def __get__(self): return self.span_c().end_char def __set__(self, int end_char): - if end_char < 0: - raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) - self.span_c().end_char = end_char + if end_char < 0 or end_char > len(self.doc.text): + raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char)) + cdef int end = token_by_end(self.doc.c, self.doc.length, end_char) + if end < 0: + raise ValueError(Errors.E4008.format(value=end_char, pos="end")) + cdef SpanC* span_c = self.span_c() + if span_c.start_char > end_char: + raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char)) + span_c.end_char = end_char + span_c.end = end property label: def __get__(self):