commit 49747697a2

Merge branch 'v4' into feature/docwise-generator-batching

# Conflicts:
#   spacy/kb/kb.pyx
#   spacy/ml/models/entity_linker.py
#   spacy/pipeline/entity_linker.py
#   website/docs/api/inmemorylookupkb.mdx
#   website/docs/api/kb.mdx
@@ -926,7 +926,7 @@ class Errors(metaclass=ErrorsWithCodes):
     E1029 = ("Edit tree cannot be applied to form.")
     E1030 = ("Edit tree identifier out of range.")
     E1031 = ("Could not find gold transition - see logs above.")
-    E1032 = ("`{var}` should not be {forbidden}, but received {value}.")
+    E1032 = ("Span {var} {value} is out of bounds for {obj} with length {length}.")
     E1033 = ("Dimension {name} invalid -- only nO, nF, nP")
     E1034 = ("Node index {i} out of bounds ({length})")
     E1035 = ("Token index {i} out of bounds ({length})")
@@ -966,6 +966,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E4004 = ("Backprop is not supported when is_train is not set.")
     E4005 = ("EntityLinker_v1 is not supported in spaCy v4. Update your configuration.")
     E4006 = ("Expected `entity_id` to be of type {exp_type}, but is of type {found_type}.")
+    E4007 = ("Span {var} {value} must be {op} Span {existing_var} "
+             "{existing_value}.")
+    E4008 = ("Span {pos}_char {value} does not correspond to a token {pos}.")


 RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
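For orientation, a small sketch of how the two new error templates read once formatted; the keyword values below are made up purely for illustration and are not taken from this diff.

    # hypothetical values, only to show the shape of the new messages
    e4007 = ("Span {var} {value} must be {op} Span {existing_var} "
             "{existing_value}.").format(
        var="start", value=4, op="<=", existing_var="end", existing_value=3
    )
    # -> "Span start 4 must be <= Span end 3."
    e4008 = "Span {pos}_char {value} does not correspond to a token {pos}.".format(
        pos="start", value=7
    )
    # -> "Span start_char 7 does not correspond to a token start."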
@@ -2,5 +2,4 @@ from .kb import KnowledgeBase
 from .kb_in_memory import InMemoryLookupKB
 from .candidate import Candidate, InMemoryCandidate
-

 __all__ = ["KnowledgeBase", "InMemoryLookupKB", "Candidate", "InMemoryCandidate"]
@@ -43,18 +43,6 @@ cdef class KnowledgeBase:
             Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
         )

-    def get_candidates(self, mention: Span) -> Iterable[Candidate]:
-        """
-        Return candidate entities for specified text. Each candidate defines the entity, the original alias,
-        and the prior probability of that alias resolving to that entity.
-        If the no candidate is found for a given text, an empty list is returned.
-        mention (Span): Mention for which to get candidates.
-        RETURNS (Iterable[Candidate]): Identified candidates.
-        """
-        raise NotImplementedError(
-            Errors.E1045.format(parent="KnowledgeBase", method="get_candidates", name=self.__name__)
-        )
-
     def get_vectors(self, entities: Iterable[str]) -> Iterable[Iterable[float]]:
         """
         Return vectors for entities.
@@ -1202,7 +1202,7 @@ class Language:
         _: Optional[Any] = None,
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1213,7 +1213,9 @@ class Language:
         examples (Iterable[Example]): A batch of examples
         _: Should not be set - serves to catch backwards-incompatible scripts.
         drop (float): The dropout rate.
-        sgd (Optimizer): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Dict[str, float]): Dictionary to update with the loss, keyed by
             component.
         component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
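Read together with the docs change further down, a minimal usage sketch of the three `sgd` modes; the pipeline `nlp`, the `train_examples` list and the call pattern below are assumptions for illustration, not part of this commit.

    # assumes an initialized pipeline `nlp` and a list of Example objects
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    # sgd left as None (default): update() creates/uses its own optimizer
    losses = nlp.update(train_examples, losses={})

    # an explicit optimizer is used to apply the weight updates
    losses = nlp.update(train_examples, sgd=optimizer, losses={})

    # sgd=False: gradients are computed but finish_update() is skipped,
    # so no optimizer step happens inside update()
    losses = nlp.update(train_examples, sgd=False, losses={})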
@@ -1272,6 +1274,7 @@ class Language:
                 name not in exclude
                 and isinstance(proc, ty.TrainableComponent)
                 and proc.is_trainable
+                and sgd not in (None, False)
             ):
                 proc.finish_update(sgd)

@@ -21,7 +21,6 @@ from thinc.api import CosineDistance, Model, Optimizer, Config
 from thinc.api import set_dropout_rate

 from ..kb import KnowledgeBase, Candidate
-from ..ml import empty_kb
 from ..tokens import Doc, SpanGroup
 from .pipe import deserialize_config
 from .trainable_pipe import TrainablePipe
@@ -707,3 +707,50 @@ def test_span_ent_id(en_tokenizer):
     doc.ents = [span]
     assert doc.ents[0].ent_id_ == "ID2"
     assert doc[1].ent_id_ == "ID2"
+
+
+def test_span_start_end_sync(en_tokenizer):
+    doc = en_tokenizer("a bc def e fghij kl")
+    # can create and edit span starts/ends
+    span = doc[2:4]
+    span.start_char = 2
+    span.end = 5
+    assert span == doc[span.start : span.end]
+    assert span == doc.char_span(span.start_char, span.end_char)
+    # cannot set completely out of bounds starts/ends
+    with pytest.raises(IndexError):
+        span.start = -1
+    with pytest.raises(IndexError):
+        span.end = -1
+    with pytest.raises(IndexError):
+        span.start_char = len(doc.text) + 1
+    with pytest.raises(IndexError):
+        span.end = len(doc.text) + 1
+    # test all possible char starts/ends
+    span = doc[0 : len(doc)]
+    token_char_starts = [token.idx for token in doc]
+    token_char_ends = [token.idx + len(token.text) for token in doc]
+    for i in range(len(doc.text)):
+        if i not in token_char_starts:
+            with pytest.raises(ValueError):
+                span.start_char = i
+        else:
+            span.start_char = i
+    span = doc[0 : len(doc)]
+    for i in range(len(doc.text)):
+        if i not in token_char_ends:
+            with pytest.raises(ValueError):
+                span.end_char = i
+        else:
+            span.end_char = i
+    # start must be <= end
+    span = doc[1:3]
+    with pytest.raises(ValueError):
+        span.start = 4
+    with pytest.raises(ValueError):
+        span.end = 0
+    span = doc.char_span(2, 8)
+    with pytest.raises(ValueError):
+        span.start_char = 9
+    with pytest.raises(ValueError):
+        span.end_char = 1
@@ -157,6 +157,24 @@ def test_language_update_updates():
     )


+def test_language_update_does_not_update_with_sgd_false():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=False)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    xp.testing.assert_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}
@@ -772,36 +772,61 @@ cdef class Span:
             return self.span_c().start

         def __set__(self, int start):
-            if start < 0:
-                raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start))
-            self.span_c().start = start
+            if start < 0 or start > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="start", obj="Doc", length=self.doc.length, value=start))
+            cdef SpanC* span_c = self.span_c()
+            if start > span_c.end:
+                raise ValueError(Errors.E4007.format(var="start", value=start, op="<=", existing_var="end", existing_value=span_c.end))
+            span_c.start = start
+            span_c.start_char = self.doc.c[start].idx

     property end:
         def __get__(self):
             return self.span_c().end

         def __set__(self, int end):
-            if end < 0:
-                raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end))
-            self.span_c().end = end
+            if end < 0 or end > self.doc.length:
+                raise IndexError(Errors.E1032.format(var="end", obj="Doc", length=self.doc.length, value=end))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start > end:
+                raise ValueError(Errors.E4007.format(var="end", value=end, op=">=", existing_var="start", existing_value=span_c.start))
+            span_c.end = end
+            if end > 0:
+                span_c.end_char = self.doc.c[end-1].idx + self.doc.c[end-1].lex.length
+            else:
+                span_c.end_char = 0

     property start_char:
         def __get__(self):
             return self.span_c().start_char

         def __set__(self, int start_char):
-            if start_char < 0:
-                raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char))
-            self.span_c().start_char = start_char
+            if start_char < 0 or start_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="start_char", obj="Doc text", length=len(self.doc.text), value=start_char))
+            cdef int start = token_by_start(self.doc.c, self.doc.length, start_char)
+            if start < 0:
+                raise ValueError(Errors.E4008.format(value=start_char, pos="start"))
+            cdef SpanC* span_c = self.span_c()
+            if start_char > span_c.end_char:
+                raise ValueError(Errors.E4007.format(var="start_char", value=start_char, op="<=", existing_var="end_char", existing_value=span_c.end_char))
+            span_c.start_char = start_char
+            span_c.start = start

     property end_char:
         def __get__(self):
             return self.span_c().end_char

         def __set__(self, int end_char):
-            if end_char < 0:
-                raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char))
-            self.span_c().end_char = end_char
+            if end_char < 0 or end_char > len(self.doc.text):
+                raise IndexError(Errors.E1032.format(var="end_char", obj="Doc text", length=len(self.doc.text), value=end_char))
+            cdef int end = token_by_end(self.doc.c, self.doc.length, end_char)
+            if end < 0:
+                raise ValueError(Errors.E4008.format(value=end_char, pos="end"))
+            cdef SpanC* span_c = self.span_c()
+            if span_c.start_char > end_char:
+                raise ValueError(Errors.E4007.format(var="end_char", value=end_char, op=">=", existing_var="start_char", existing_value=span_c.start_char))
+            span_c.end_char = end_char
+            span_c.end = end

     property label:
         def __get__(self):
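Mirroring the new test above, a short sketch of the now-writable, kept-in-sync span boundaries; `spacy.blank("en")` is an assumption standing in for the test's `en_tokenizer` fixture.

    import spacy

    nlp = spacy.blank("en")  # assumption: plain English tokenizer
    doc = nlp("a bc def e fghij kl")
    span = doc[2:4]
    span.start_char = 2  # must land on a token start, otherwise E4008 (ValueError)
    span.end = 5         # token index; end_char is recomputed from the tokens
    assert span == doc[span.start : span.end]
    assert span == doc.char_span(span.start_char, span.end_char)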
@@ -210,7 +210,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
             )
@@ -323,15 +323,15 @@ and custom registered functions if needed. See the
 > nlp.update([example], sgd=optimizer)
 > ```

 | Name            | Description |
-| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_  | |
-| `drop`          | The dropout rate. ~~float~~ |
-| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
-| `losses`        | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
+| `drop`          | The dropout rate. Defaults to `0.0`. ~~float~~ |
+| `sgd`           | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if `None`. No optimizer will be used when set to `False`. Defaults to `None`. ~~Union[Optimizer, None, Literal[False]]~~ |
+| `losses`        | Dictionary to update with the loss, keyed by pipeline component. Defaults to `None`. ~~Optional[Dict[str, float]]~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | **RETURNS**     | The updated `losses` dictionary. ~~Dict[str, float]~~ |

 ## Language.distill {id="distill",tag="method,experimental",version="4"}

@@ -8,6 +8,13 @@ Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values instead of
 integer IDs. This ensures that strings always map to the same ID, even from
 different `StringStores`.

+<Infobox variant="warning">
+
+Note that a `StringStore` instance is not static. It increases in size as texts
+with new tokens are processed.
+
+</Infobox>
+
 ## StringStore.\_\_init\_\_ {id="init",tag="method"}

 Create the `StringStore`.
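To illustrate the warning, a minimal sketch assuming a blank English pipeline; it is not part of the docs change itself.

    import spacy

    nlp = spacy.blank("en")            # assumption: blank "en" pipeline
    n_before = len(nlp.vocab.strings)  # number of strings currently interned
    nlp("zxqvbn floobargle")           # process text containing novel tokens
    n_after = len(nlp.vocab.strings)
    assert n_after > n_before          # the store grew with the new token strings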
@@ -10,6 +10,13 @@ The `Vocab` object provides a lookup table that allows you to access
 [`StringStore`](/api/stringstore). It also owns underlying C-data that is shared
 between `Doc` objects.

+<Infobox variant="warning">
+
+Note that a `Vocab` instance is not static. It increases in size as texts with
+new tokens are processed.
+
+</Infobox>
+
 ## Vocab.\_\_init\_\_ {id="init",tag="method"}

 Create the vocabulary.