diff --git a/spacy/errors.py b/spacy/errors.py index c6a3e8161..5d0194ed6 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -926,7 +926,7 @@ class Errors(metaclass=ErrorsWithCodes): E1029 = ("Edit tree cannot be applied to form.") E1030 = ("Edit tree identifier out of range.") E1031 = ("Could not find gold transition - see logs above.") - E1032 = ("`{var}` should not be {forbidden}, but received {value}.") + E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.") E1033 = ("Dimension {name} invalid -- only nO, nF, nP") E1034 = ("Node index {i} out of bounds ({length})") E1035 = ("Token index {i} out of bounds ({length})") @@ -966,6 +966,9 @@ class Errors(metaclass=ErrorsWithCodes): E4004 = ("Backprop is not supported when is_train is not set.") E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.") E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.") + E4007 = ("Span {var} {value} must be {op} Span {existing_var} " + "{existing_value}.") + E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.") RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"} diff --git a/spacy/kb/__init__.py b/spacy/kb/__init__.py index ff0e209e3..c8a657d62 100644 --- a/spacy/kb/__init__.py +++ b/spacy/kb/__init__.py @@ -2,5 +2,4 @@ from .kb import KnowledgeBase from .kb_in_memory import InMemoryLookupKB from .candidate import Candidate, InMemoryCandidate - __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"] diff --git a/spacy/kb/kb.pyx b/spacy/kb/kb.pyx index 1de8932a7..7b76373c8 100644 --- a/spacy/kb/kb.pyx +++ b/spacy/kb/kb.pyx @@ -43,18 +43,6 @@ cdef class KnowledgeBase: Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) ) - def get_candidates(self, mention: Span) -> Iterable[Candidate]: - """ - Return candidate entities for specified text. 
Each candidate defines the entity, the original alias, - and the prior probability of that alias resolving to that entity. - If the no candidate is found for a given text, an empty list is returned. - mention (Span): Mention for which to get candidates. - RETURNS (Iterable[Candidate]): Identified candidates. - """ - raise NotImplementedError( - Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__) - ) - def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]: """ Return vectors for entities. diff --git a/spacy/language.py b/spacy/language.py index 3b86fdde7..ce3630629 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1202,7 +1202,7 @@ class Language: _: Optional[Any] = None, *, drop: float = 0.0, - sgd: Optional[Optimizer] = None, + sgd: Union[Optimizer, None, Literal[False]] = None, losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), @@ -1213,7 +1213,9 @@ class Language: examples (Iterable[Example]): A batch of examples _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. - sgd (Optimizer): An optimizer. + sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will + be created via create_optimizer if 'None'. No optimizer will + be used when set to 'False'. losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. 
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline @@ -1272,6 +1274,7 @@ class Language: name not in exclude and isinstance(proc, ty.TrainableComponent) and proc.is_trainable + and sgd not in (None, False) ): proc.finish_update(sgd) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 77c1017b5..2618e3b0e 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -21,7 +21,6 @@ from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import set_dropout_rate from ..kb import KnowledgeBase, Candidate -from ..ml import empty_kb from ..tokens import Doc, SpanGroup from .pipe import deserialize_config from .trainable_pipe import TrainablePipe diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index a99f8b561..8eabcd645 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -707,3 +707,50 @@ def test_span_ent_id(en_tokenizer): doc.ents = [span] assert doc.ents[0].ent_id_ == "ID2" assert doc[1].ent_id_ == "ID2" + + +def test_span_start_end_sync(en_tokenizer): + doc = en_tokenizer("a bc def e fghij kl") + # can create and edit span starts/ends + span = doc[2:4] + span.start_char = 2 + span.end = 5 + assert span == doc[span.start : span.end] + assert span == doc.char_span(span.start_char, span.end_char) + # cannot set completely out of bounds starts/ends + with pytest.raises(IndexError): + span.start = -1 + with pytest.raises(IndexError): + span.end = -1 + with pytest.raises(IndexError): + span.start_char = len(doc.text) + 1 + with pytest.raises(IndexError): + span.end_char = len(doc.text) + 1 + # test all possible char starts/ends + span = doc[0 : len(doc)] + token_char_starts = [token.idx for token in doc] + token_char_ends = [token.idx + len(token.text) for token in doc] + for i in range(len(doc.text)): + if i not in token_char_starts: + with pytest.raises(ValueError): + span.start_char = i + else: + span.start_char = i + span = 
doc[0 : len(doc)] + for i in range(len(doc.text)): + if i not in token_char_ends: + with pytest.raises(ValueError): + span.end_char = i + else: + span.end_char = i + # start must be <= end + span = doc[1:3] + with pytest.raises(ValueError): + span.start = 4 + with pytest.raises(ValueError): + span.end = 0 + span = doc.char_span(2, 8) + with pytest.raises(ValueError): + span.start_char = 9 + with pytest.raises(ValueError): + span.end_char = 1 diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 9b8c7b9c7..08a7d28a4 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -157,6 +157,24 @@ def test_language_update_updates(): ) +def test_language_update_does_not_update_with_sgd_false(): + config = Config().from_str(TAGGER_CFG_STRING) + nlp = load_model_from_config(config, auto_fill=True, validate=True) + + train_examples = [] + for t in TAGGER_TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + + nlp.initialize(get_examples=lambda: train_examples) + + docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + nlp.update(train_examples, sgd=False) + docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples])) + + xp = get_array_module(docs_after_update[0].tensor) + xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor) + + def test_language_evaluate(nlp): text = "hello world" annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}} diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 75f7db7ca..3d64a24a5 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -772,36 +772,61 @@ cdef class Span: return self.span_c().start def __set__(self, int start): - if start < 0: - raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) - self.span_c().start = start + if start < 0 or start > self.doc.length: + raise IndexError(Errors.E1032.format(var="start", 
obj="Doc", length=self.doc.length, value=start)) + cdef SpanC* span_c = self.span_c() + if start > span_c.end: + raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end)) + span_c.start = start + span_c.start_char = self.doc.c[start].idx if start < self.doc.length else span_c.end_char property end: def __get__(self): return self.span_c().end def __set__(self, int end): - if end < 0: - raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) - self.span_c().end = end + if end < 0 or end > self.doc.length: + raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end)) + cdef SpanC* span_c = self.span_c() + if span_c.start > end: + raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start)) + span_c.end = end + if end > 0: + span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length + else: + span_c.end_char = 0 property start_char: def __get__(self): return self.span_c().start_char def __set__(self, int start_char): - if start_char < 0: - raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) - self.span_c().start_char = start_char + if start_char < 0 or start_char > len(self.doc.text): + raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char)) + cdef int start = token_by_start(self.doc.c, self.doc.length, start_char) + if start < 0: + raise ValueError(Errors.E4008.format(value=start_char, pos="start")) + cdef SpanC* span_c = self.span_c() + if start_char > span_c.end_char: + raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char)) + span_c.start_char = start_char + span_c.start = start property end_char: def __get__(self): return self.span_c().end_char def __set__(self, int end_char): - if end_char < 0: - raise 
IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) - self.span_c().end_char = end_char + if end_char < 0 or end_char > len(self.doc.text): + raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char)) + cdef int end = token_by_end(self.doc.c, self.doc.length, end_char) + if end < 0: + raise ValueError(Errors.E4008.format(value=end_char, pos="end")) + cdef SpanC* span_c = self.span_c() + if span_c.start_char > end_char: + raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char)) + span_c.end_char = end_char + span_c.end = end property label: def __get__(self): diff --git a/spacy/training/loop.py b/spacy/training/loop.py index c737d7c01..587a2516c 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -210,7 +210,7 @@ def train_while_improving( subbatch, drop=dropout, losses=losses, - sgd=None, + sgd=False, exclude=exclude, annotates=annotating_components, ) diff --git a/website/docs/api/language.mdx b/website/docs/api/language.mdx index c25bfcee5..5cd9e4af8 100644 --- a/website/docs/api/language.mdx +++ b/website/docs/api/language.mdx @@ -323,15 +323,15 @@ and custom registered functions if needed. See the > nlp.update([example], sgd=optimizer) > ``` -| Name | Description | -| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Dictionary to update with the loss, keyed by pipeline component. 
~~Optional[Dict[str, float]]~~ | -| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. Defaults to `0.0`. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ | +| `losses` | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~ | +| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Language.distill {id="distill",tag="method,experimental",version="4"} diff --git a/website/docs/api/stringstore.mdx b/website/docs/api/stringstore.mdx index 7e380f5f8..2425c8adc 100644 --- a/website/docs/api/stringstore.mdx +++ b/website/docs/api/stringstore.mdx @@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of integer IDs. This ensures that strings always map to the same ID, even from different `StringStores`. + + +Note that a `StringStore` instance is not static. It increases in size as texts +with new tokens are processed. + + + ## StringStore.\_\_init\_\_ {id="init",tag="method"} Create the `StringStore`. 
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index 304040f9c..1e32eb118 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared between `Doc` objects. + + +Note that a `Vocab` instance is not static. It increases in size as texts with +new tokens are processed. + + + ## Vocab.\_\_init\_\_ {id="init",tag="method"} Create the vocabulary.