Merge branch 'v4' into feature/docwise-generator-batching

# Conflicts:
#	spacy/kb/kb.pyx
#	spacy/ml/models/entity_linker.py
#	spacy/pipeline/entity_linker.py
#	website/docs/api/inmemorylookupkb.mdx
#	website/docs/api/kb.mdx
Raphael Mitsch 2023-04-17 16:28:09 +02:00
commit 49747697a2
12 changed files with 135 additions and 39 deletions

spacy/errors.py

@@ -926,7 +926,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -966,6 +966,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")

 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
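For reference, here is roughly how the new templates render once formatted — a sketch with made-up field values, not output copied from spaCy:

```python
# Illustrative rendering of the new error templates (values are made up):
E1032 = "Span {var} {value} is out of bounds for {obj} with length {length}."
E4007 = "Span {var} {value} must be {op} Span {existing_var} {existing_value}."
E4008 = "Span {pos}_char {value} does not correspond to a token {pos}."

print(E1032.format(var="start", value=99, obj="Doc", length=6))
# Span start 99 is out of bounds for Doc with length 6.
print(E4007.format(var="start", value=5, op="<=", existing_var="end", existing_value=3))
# Span start 5 must be <= Span end 3.
print(E4008.format(pos="start", value=3))
# Span start_char 3 does not correspond to a token start.
```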

spacy/kb/__init__.py

@@ -2,5 +2,4 @@ from .kb import KnowledgeBase
from .kb_in_memory import InMemoryLookupKB
from .candidate import Candidate, InMemoryCandidate
__all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]

spacy/kb/kb.pyx

@@ -43,18 +43,6 @@ cdef class KnowledgeBase:
             Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
         )

-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If no candidates are found for a given text, an empty list is returned.
-
-        mention (Span): Mention for which to get candidates.
-        RETURNS (Iterable[Candidate]): Identified candidates.
-        """
-        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
-        )
-
     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
         """
         Return vectors for entities.

spacy/language.py

@@ -1202,7 +1202,7 @@ class Language:
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1213,7 +1213,9 @@ class Language:
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
@@ -1272,6 +1274,7 @@ class Language:
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)
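In short, `sgd` now has three modes. A minimal usage sketch (assuming an initialized pipeline `nlp` and a list of `Example` objects `examples`):

```python
# The three accepted values for `sgd` after this change:
optimizer = nlp.create_optimizer()

nlp.update(examples, sgd=optimizer)  # step the weights with the given optimizer
nlp.update(examples)                 # sgd=None: an internal optimizer is created via create_optimizer
nlp.update(examples, sgd=False)      # accumulate gradients only; finish_update is skipped
```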

spacy/pipeline/entity_linker.py

@@ -21,7 +21,6 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
from thinc.api import set_dropout_rate
from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc, SpanGroup
from .pipe import deserialize_config
from .trainable_pipe import TrainablePipe

spacy/tests/doc/test_span.py

@@ -707,3 +707,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1

spacy/tests/test_language.py

@@ -157,6 +157,24 @@ def test_language_update_updates():
     )


+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}

spacy/tokens/span.pyx

@@ -772,36 +772,61 @@ cdef class Span:
             return self.span_c().start

         def __set__(self, int start):
-            if start < 0:
-                raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx

     property end:
         def __get__(self):
             return self.span_c().end

         def __set__(self, int end):
-            if end < 0:
-                raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0

     property start_char:
         def __get__(self):
             return self.span_c().start_char

         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start

     property end_char:
         def __get__(self):
             return self.span_c().end_char

         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end

     property label:
         def __get__(self):
spacy/training/loop.py

@@ -210,7 +210,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
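The training loop accumulates gradients across subbatches and applies a single optimizer step per full batch; passing `sgd=False` now states explicitly that `update` itself must not touch the weights. Roughly, as a simplified sketch (not the verbatim loop code; `nlp`, `optimizer`, `batch`, `dropout`, `subdivide_batch` and `accumulate_gradient` are assumed from the surrounding function):

```python
# Simplified sketch of the accumulate-then-step pattern used by the loop:
losses = {}
for subbatch in subdivide_batch(batch, accumulate_gradient):
    # sgd=False: gradients are accumulated, no per-subbatch optimizer step
    nlp.update(subbatch, drop=dropout, losses=losses, sgd=False)
for name, proc in nlp.pipeline:
    if hasattr(proc, "finish_update"):
        proc.finish_update(optimizer)  # one weight update per full batch
```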

website/docs/api/language.mdx

@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 > nlp.update([example], sgd=optimizer)
 > ```

-| Name            | Description                                                                                                                                     |
-| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                               |
-| _keyword-only_  |                                                                                                                                                 |
-| `drop`          | The dropout rate. ~~float~~                                                                                                                     |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                   |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~                                                 |
-| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
-| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                           |
+| Name            | Description                                                                                                                                                                                        |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                                                                                  |
+| _keyword-only_  |                                                                                                                                                                                                    |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~                                                                                                                                                     |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~                                                                                |
+| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                                                     |
+| **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                                              |

 ## Language.distill {id="distill",tag="method,experimental",version="4"}

website/docs/api/stringstore.mdx

@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.

+<Infobox variant="warning">
+
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+
+</Infobox>
+
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}

 Create the `StringStore`.
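The new warning is easy to verify — a minimal sketch, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
n_strings = len(nlp.vocab.strings)
nlp("a text with previously unseen tokens")
assert len(nlp.vocab.strings) > n_strings  # the store grew during processing
```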

website/docs/api/vocab.mdx

@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.

+<Infobox variant="warning">
+
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+
+</Infobox>
+
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}

 Create the vocabulary.
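Analogously for the `Vocab`, which gains a `Lexeme` for each new word type — a minimal sketch, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
n_lexemes = len(nlp.vocab)
nlp("completely novel wordforms appear here")
assert len(nlp.vocab) > n_lexemes  # new lexemes were added during tokenization
```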