Merge branch 'develop' into feature/replace-listeners

This commit is contained in:
Ines Montani 2021-01-29 15:57:32 +11:00
commit 01ecfbcc45
23 changed files with 388 additions and 204 deletions

View File

@ -132,6 +132,11 @@ class Warnings:
"'morphologizer'.") "'morphologizer'.")
W109 = ("Unable to save user hooks while serializing the doc. Re-add any " W109 = ("Unable to save user hooks while serializing the doc. Re-add any "
"required user hooks to the doc after processing.") "required user hooks to the doc after processing.")
W110 = ("The DependencyMatcher token pattern {pattern} matched a span "
"{tokens} that is 2+ tokens long. Only the first token in the span "
"will be included in the results. For better results, token "
"patterns should return matches that are each exactly one token "
"long.")
@add_codes @add_codes
@ -470,6 +475,10 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues") "issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
E890 = ("Can not add the alias '{alias}' to the Knowledge base. "
"Each alias should be a meaningful string.")
E891 = ("Alias '{alias}' could not be added to the Knowledge base. "
"This is likely a bug in spaCy.")
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}") E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. " "If you're using a custom function, make sure the code is available. "
@ -747,6 +756,10 @@ class Errors:
"file.json .`.") "file.json .`.")
E1015 = ("Can't initialize model from config: no {value} found. For more " E1015 = ("Can't initialize model from config: no {value} found. For more "
"information, run: python -m spacy debug config config.cfg") "information, run: python -m spacy debug config config.cfg")
E1016 = ("The operators 'OP': '?', '*', and '+' are not supported in "
"DependencyMatcher token patterns. The token pattern in "
"RIGHT_ATTR should return matches that are each exactly one token "
"long. Invalid pattern:\n{node}")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -187,6 +187,10 @@ cdef class KnowledgeBase:
For a given alias, add its potential entities and prior probabilities to the KB. For a given alias, add its potential entities and prior probabilities to the KB.
Return the alias_hash at the end Return the alias_hash at the end
""" """
if alias is None or len(alias) == 0:
raise ValueError(Errors.E890.format(alias=alias))
previous_alias_nr = self.get_size_aliases()
# Throw an error if the length of entities and probabilities are not the same # Throw an error if the length of entities and probabilities are not the same
if not len(entities) == len(probabilities): if not len(entities) == len(probabilities):
raise ValueError(Errors.E132.format(alias=alias, raise ValueError(Errors.E132.format(alias=alias,
@ -220,6 +224,8 @@ cdef class KnowledgeBase:
new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
self._alias_index[alias_hash] = new_index self._alias_index[alias_hash] = new_index
if previous_alias_nr + 1 != self.get_size_aliases():
raise RuntimeError(Errors.E891.format(alias=alias))
return alias_hash return alias_hash
def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False): def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):

View File

@ -20,7 +20,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples from .training import Example, validate_examples
from .training.initialize import init_vocab, init_tok2vec from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer from .scorer import Scorer
from .util import registry, SimpleFrozenList, _pipe from .util import registry, SimpleFrozenList, _pipe, raise_error
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@ -176,6 +176,7 @@ class Language:
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"] create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
self.tokenizer = create_tokenizer(self) self.tokenizer = create_tokenizer(self)
self.batch_size = batch_size self.batch_size = batch_size
self.default_error_handler = raise_error
def __init_subclass__(cls, **kwargs): def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs) super().__init_subclass__(**kwargs)
@ -1022,11 +1023,16 @@ class Language:
continue continue
if not hasattr(proc, "__call__"): if not hasattr(proc, "__call__"):
raise ValueError(Errors.E003.format(component=type(proc), name=name)) raise ValueError(Errors.E003.format(component=type(proc), name=name))
error_handler = self.default_error_handler
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
try: try:
doc = proc(doc, **component_cfg.get(name, {})) doc = proc(doc, **component_cfg.get(name, {}))
except KeyError as e: except KeyError as e:
# This typically happens if a component is not initialized # This typically happens if a component is not initialized
raise ValueError(Errors.E109.format(name=name)) from e raise ValueError(Errors.E109.format(name=name)) from e
except Exception as e:
error_handler(name, proc, [doc], e)
if doc is None: if doc is None:
raise ValueError(Errors.E005.format(name=name)) raise ValueError(Errors.E005.format(name=name))
return doc return doc
@ -1315,6 +1321,26 @@ class Language:
self._optimizer = self.create_optimizer() self._optimizer = self.create_optimizer()
return self._optimizer return self._optimizer
def set_error_handler(
self,
error_handler: Callable[
[str, Callable[[Doc], Doc], List[Doc], Exception], None
],
):
"""Set an error handler object for all the components in the pipeline that implement
a set_error_handler function.
error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]):
Function that deals with a failing batch of documents. This callable function should take in
the component's name, the component itself, the offending batch of documents, and the exception
that was thrown.
DOCS: https://nightly.spacy.io/api/language#set_error_handler
"""
self.default_error_handler = error_handler
for name, pipe in self.pipeline:
if hasattr(pipe, "set_error_handler"):
pipe.set_error_handler(error_handler)
def evaluate( def evaluate(
self, self,
examples: Iterable[Example], examples: Iterable[Example],
@ -1334,6 +1360,7 @@ class Language:
arguments for specific components. arguments for specific components.
scorer_cfg (dict): An optional dictionary with extra keyword arguments scorer_cfg (dict): An optional dictionary with extra keyword arguments
for the scorer. for the scorer.
RETURNS (Scorer): The scorer containing the evaluation results. RETURNS (Scorer): The scorer containing the evaluation results.
DOCS: https://nightly.spacy.io/api/language#evaluate DOCS: https://nightly.spacy.io/api/language#evaluate
@ -1358,7 +1385,14 @@ class Language:
kwargs = component_cfg.get(name, {}) kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size) kwargs.setdefault("batch_size", batch_size)
for doc, eg in zip( for doc, eg in zip(
_pipe((eg.predicted for eg in examples), pipe, kwargs), examples _pipe(
(eg.predicted for eg in examples),
proc=pipe,
name=name,
default_error_handler=self.default_error_handler,
kwargs=kwargs,
),
examples,
): ):
eg.predicted = doc eg.predicted = doc
end_time = timer() end_time = timer()
@ -1463,7 +1497,13 @@ class Language:
kwargs = component_cfg.get(name, {}) kwargs = component_cfg.get(name, {})
# Allow component_cfg to overwrite the top-level kwargs. # Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size) kwargs.setdefault("batch_size", batch_size)
f = functools.partial(_pipe, proc=proc, kwargs=kwargs) f = functools.partial(
_pipe,
proc=proc,
name=name,
kwargs=kwargs,
default_error_handler=self.default_error_handler,
)
pipes.append(f) pipes.append(f)
if n_process != 1: if n_process != 1:

View File

@ -9,8 +9,9 @@ from .matcher cimport Matcher
from ..vocab cimport Vocab from ..vocab cimport Vocab
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..errors import Errors from ..errors import Errors, Warnings
from ..tokens import Span from ..tokens import Span
from ..util import logger
DELIMITER = "||" DELIMITER = "||"
@ -137,6 +138,8 @@ cdef class DependencyMatcher:
raise ValueError(Errors.E1007.format(op=relation["REL_OP"])) raise ValueError(Errors.E1007.format(op=relation["REL_OP"]))
visited_nodes[relation["RIGHT_ID"]] = True visited_nodes[relation["RIGHT_ID"]] = True
visited_nodes[relation["LEFT_ID"]] = True visited_nodes[relation["LEFT_ID"]] = True
if relation["RIGHT_ATTRS"].get("OP", "") in ("?", "*", "+"):
raise ValueError(Errors.E1016.format(node=relation))
idx = idx + 1 idx = idx + 1
def _get_matcher_key(self, key, pattern_idx, token_idx): def _get_matcher_key(self, key, pattern_idx, token_idx):
@ -277,7 +280,9 @@ cdef class DependencyMatcher:
e.g. keys_to_position_maps[root_index][match_id] = [...] e.g. keys_to_position_maps[root_index][match_id] = [...]
""" """
keys_to_position_maps = defaultdict(lambda: defaultdict(list)) keys_to_position_maps = defaultdict(lambda: defaultdict(list))
for match_id, start, _ in self._matcher(doc): for match_id, start, end in self._matcher(doc):
if start + 1 != end:
logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0]))
token = doc[start] token = doc[start]
root = ([token] + list(token.ancestors))[-1] root = ([token] + list(token.ancestors))[-1]
keys_to_position_maps[root.i][match_id].append(start) keys_to_position_maps[root.i][match_id].append(start)

View File

@ -96,12 +96,25 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#call DOCS: https://nightly.spacy.io/api/attributeruler#call
""" """
error_handler = self.get_error_handler()
try:
matches = self.match(doc)
self.set_annotations(doc, matches)
return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def match(self, doc: Doc):
matches = self.matcher(doc, allow_missing=True) matches = self.matcher(doc, allow_missing=True)
# Sort by the attribute ID, so that later rules have precedence # Sort by the attribute ID, so that later rules have precedence
matches = [ matches = [
(int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches
] ]
matches.sort() matches.sort()
return matches
def set_annotations(self, doc, matches):
"""Modify the document in place"""
for attr_id, match_id, start, end in matches: for attr_id, match_id, start, end in matches:
span = Span(doc, start, end, label=match_id) span = Span(doc, start, end, label=match_id)
attrs = self.attrs[attr_id] attrs = self.attrs[attr_id]
@ -121,7 +134,7 @@ class AttributeRuler(Pipe):
) )
) from None ) from None
set_token_attrs(span[index], attrs) set_token_attrs(span[index], attrs)
return doc
def load_from_tag_map( def load_from_tag_map(
self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]] self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]]

View File

@ -1,6 +1,6 @@
from itertools import islice from typing import Optional, Iterable, Callable, Dict, Union, List
from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List
from pathlib import Path from pathlib import Path
from itertools import islice
import srsly import srsly
import random import random
from thinc.api import CosineDistance, Model, Optimizer, Config from thinc.api import CosineDistance, Model, Optimizer, Config
@ -276,34 +276,6 @@ class EntityLinker(TrainablePipe):
loss = loss / len(entity_encodings) loss = loss / len(entity_encodings)
return loss, gradients return loss, gradients
def __call__(self, doc: Doc) -> Doc:
"""Apply the pipe to a Doc.
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.
DOCS: https://nightly.spacy.io/api/entitylinker#call
"""
kb_ids = self.predict([doc])
self.set_annotations([doc], kb_ids)
return doc
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/entitylinker#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
kb_ids = self.predict(docs)
self.set_annotations(docs, kb_ids)
yield from docs
def predict(self, docs: Iterable[Doc]) -> List[str]: def predict(self, docs: Iterable[Doc]) -> List[str]:
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is Returns the KB IDs for each entity in each doc, including NIL if there is

View File

@ -135,12 +135,25 @@ class EntityRuler(Pipe):
DOCS: https://nightly.spacy.io/api/entityruler#call DOCS: https://nightly.spacy.io/api/entityruler#call
""" """
error_handler = self.get_error_handler()
try:
matches = self.match(doc)
self.set_annotations(doc, matches)
return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def match(self, doc: Doc):
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
matches = set( matches = set(
[(m_id, start, end) for m_id, start, end in matches if start != end] [(m_id, start, end) for m_id, start, end in matches if start != end]
) )
get_sort_key = lambda m: (m[2] - m[1], -m[1]) get_sort_key = lambda m: (m[2] - m[1], -m[1])
matches = sorted(matches, key=get_sort_key, reverse=True) matches = sorted(matches, key=get_sort_key, reverse=True)
return matches
def set_annotations(self, doc, matches):
"""Modify the document in place"""
entities = list(doc.ents) entities = list(doc.ents)
new_entities = [] new_entities = []
seen_tokens = set() seen_tokens = set()
@ -163,7 +176,6 @@ class EntityRuler(Pipe):
] ]
seen_tokens.update(range(start, end)) seen_tokens.update(range(start, end))
doc.ents = entities + new_entities doc.ents = entities + new_entities
return doc
@property @property
def labels(self) -> Tuple[str, ...]: def labels(self) -> Tuple[str, ...]:

View File

@ -23,11 +23,7 @@ from .. import util
default_score_weights={"lemma_acc": 1.0}, default_score_weights={"lemma_acc": 1.0},
) )
def make_lemmatizer( def make_lemmatizer(
nlp: Language, nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False
model: Optional[Model],
name: str,
mode: str,
overwrite: bool = False,
): ):
return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite)
@ -107,10 +103,14 @@ class Lemmatizer(Pipe):
""" """
if not self._validated: if not self._validated:
self._validate_tables(Errors.E1004) self._validate_tables(Errors.E1004)
error_handler = self.get_error_handler()
try:
for token in doc: for token in doc:
if self.overwrite or token.lemma == 0: if self.overwrite or token.lemma == 0:
token.lemma_ = self.lemmatize(token)[0] token.lemma_ = self.lemmatize(token)[0]
return doc return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def initialize( def initialize(
self, self,
@ -154,21 +154,6 @@ class Lemmatizer(Pipe):
) )
self._validated = True self._validated = True
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/lemmatizer#pipe
"""
for doc in stream:
doc = self(doc)
yield doc
def lookup_lemmatize(self, token: Token) -> List[str]: def lookup_lemmatize(self, token: Token) -> List[str]:
"""Lemmatize using a lookup-based approach. """Lemmatize using a lookup-based approach.

View File

@ -1,13 +1,14 @@
# cython: infer_types=True, profile=True # cython: infer_types=True, profile=True
import warnings
from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict
import srsly import srsly
import warnings
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..training import Example from ..training import Example
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..language import Language from ..language import Language
from ..util import raise_error
cdef class Pipe: cdef class Pipe:
"""This class is a base class and not instantiated directly. It provides """This class is a base class and not instantiated directly. It provides
@ -48,9 +49,13 @@ cdef class Pipe:
DOCS: https://nightly.spacy.io/api/pipe#pipe DOCS: https://nightly.spacy.io/api/pipe#pipe
""" """
error_handler = self.get_error_handler()
for doc in stream: for doc in stream:
try:
doc = self(doc) doc = self(doc)
yield doc yield doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
"""Initialize the pipe. For non-trainable components, this method """Initialize the pipe. For non-trainable components, this method
@ -98,6 +103,30 @@ cdef class Pipe:
if not self.labels or list(self.labels) == [""]: if not self.labels or list(self.labels) == [""]:
raise ValueError(Errors.E143.format(name=self.name)) raise ValueError(Errors.E143.format(name=self.name))
def set_error_handler(self, error_handler: Callable) -> None:
"""Set an error handler function.
error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]):
Function that deals with a failing batch of documents. This callable function should take in
the component's name, the component itself, the offending batch of documents, and the exception
that was thrown.
DOCS: https://nightly.spacy.io/api/pipe#set_error_handler
"""
self.error_handler = error_handler
def get_error_handler(self) -> Optional[Callable]:
"""Retrieve the error handler function.
RETURNS (Callable): The error handler or, if none has been set, a default function that just reraises the exception.
DOCS: https://nightly.spacy.io/api/pipe#get_error_handler
"""
if hasattr(self, "error_handler"):
return self.error_handler
return raise_error
def deserialize_config(path): def deserialize_config(path):
if path.exists(): if path.exists():
return srsly.read_json(path) return srsly.read_json(path)

View File

@ -1,16 +1,14 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
import srsly
from typing import Optional, List from typing import Optional, List
import srsly
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..training import validate_examples from ..training import validate_examples
from .. import util from .. import util
@Language.factory( @Language.factory(
"sentencizer", "sentencizer",
assigns=["token.is_sent_start", "doc.sents"], assigns=["token.is_sent_start", "doc.sents"],
@ -66,6 +64,14 @@ class Sentencizer(Pipe):
DOCS: https://nightly.spacy.io/api/sentencizer#call DOCS: https://nightly.spacy.io/api/sentencizer#call
""" """
error_handler = self.get_error_handler()
try:
self._call(doc)
return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def _call(self, doc):
start = 0 start = 0
seen_period = False seen_period = False
for i, token in enumerate(doc): for i, token in enumerate(doc):
@ -79,23 +85,6 @@ class Sentencizer(Pipe):
seen_period = True seen_period = True
if start < len(doc): if start < len(doc):
doc[start].is_sent_start = True doc[start].is_sent_start = True
return doc
def pipe(self, stream, batch_size=128):
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/sentencizer#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs)
self.set_annotations(docs, predictions)
yield from docs
def predict(self, docs): def predict(self, docs):
"""Apply the pipe to a batch of docs, without modifying them. """Apply the pipe to a batch of docs, without modifying them.

View File

@ -1,5 +1,4 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from typing import List
import numpy import numpy
import srsly import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
@ -95,34 +94,6 @@ class Tagger(TrainablePipe):
"""Data about the labels currently added to the component.""" """Data about the labels currently added to the component."""
return tuple(self.cfg["labels"]) return tuple(self.cfg["labels"])
def __call__(self, doc):
"""Apply the pipe to a Doc.
doc (Doc): The document to process.
RETURNS (Doc): The processed Doc.
DOCS: https://nightly.spacy.io/api/tagger#call
"""
tags = self.predict([doc])
self.set_annotations([doc], tags)
return doc
def pipe(self, stream, *, batch_size=128):
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/tagger#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids)
yield from docs
def predict(self, docs): def predict(self, docs):
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.

View File

@ -1,5 +1,5 @@
from itertools import islice from itertools import islice
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d from thinc.types import Floats2d
import numpy import numpy
@ -9,7 +9,6 @@ from ..language import Language
from ..training import Example, validate_examples, validate_get_examples from ..training import Example, validate_examples, validate_get_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from .. import util
from ..tokens import Doc from ..tokens import Doc
from ..vocab import Vocab from ..vocab import Vocab
@ -144,22 +143,6 @@ class TextCategorizer(TrainablePipe):
""" """
return self.labels return self.labels
def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/textcategorizer#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
scores = self.predict(docs)
self.set_annotations(docs, scores)
yield from docs
def predict(self, docs: Iterable[Doc]): def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.

View File

@ -1,4 +1,4 @@
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List from typing import Sequence, Iterable, Optional, Dict, Callable, List
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice from itertools import islice
@ -8,8 +8,6 @@ from ..tokens import Doc
from ..vocab import Vocab from ..vocab import Vocab
from ..language import Language from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..util import minibatch
default_model_config = """ default_model_config = """
[model] [model]
@ -99,36 +97,6 @@ class Tok2Vec(TrainablePipe):
if isinstance(node, Tok2VecListener) and node.upstream_name in names: if isinstance(node, Tok2VecListener) and node.upstream_name in names:
self.add_listener(node, component.name) self.add_listener(node, component.name)
def __call__(self, doc: Doc) -> Doc:
"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
them to be used as features by downstream components.
docs (Doc): The Doc to process.
RETURNS (Doc): The processed Doc.
DOCS: https://nightly.spacy.io/api/tok2vec#call
"""
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
return doc
def pipe(self, stream: Iterator[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
applied to the Doc.
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/tok2vec#pipe
"""
for docs in minibatch(stream, batch_size):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
yield from docs
def predict(self, docs: Iterable[Doc]): def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.
Returns a single tensor for a batch of documents. Returns a single tensor for a batch of documents.

View File

@ -28,7 +28,7 @@ cdef class TrainablePipe(Pipe):
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component. model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name. name (str): The component instance name.
**cfg: Additonal settings and config parameters. **cfg: Additional settings and config parameters.
DOCS: https://nightly.spacy.io/api/pipe#init DOCS: https://nightly.spacy.io/api/pipe#init
""" """
@ -47,9 +47,13 @@ cdef class TrainablePipe(Pipe):
DOCS: https://nightly.spacy.io/api/pipe#call DOCS: https://nightly.spacy.io/api/pipe#call
""" """
error_handler = self.get_error_handler()
try:
scores = self.predict([doc]) scores = self.predict([doc])
self.set_annotations([doc], scores) self.set_annotations([doc], scores)
return doc return doc
except Exception as e:
error_handler(self.name, self, [doc], e)
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under """Apply the pipe to a stream of documents. This usually happens under
@ -58,14 +62,21 @@ cdef class TrainablePipe(Pipe):
stream (Iterable[Doc]): A stream of documents. stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer. batch_size (int): The number of documents to buffer.
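error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], Any]): Function that
deals with a failing batch of documents. It is not passed to this method but retrieved with
get_error_handler(), and receives the component's name, the component itself, the failing
batch and the exception. The default function just reraises the exception.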
YIELDS (Doc): Processed documents in order. YIELDS (Doc): Processed documents in order.
DOCS: https://nightly.spacy.io/api/pipe#pipe DOCS: https://nightly.spacy.io/api/pipe#pipe
""" """
error_handler = self.get_error_handler()
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
try:
scores = self.predict(docs) scores = self.predict(docs)
self.set_annotations(docs, scores) self.set_annotations(docs, scores)
yield from docs yield from docs
except Exception as e:
error_handler(self.name, self, docs, e)
def predict(self, docs: Iterable[Doc]): def predict(self, docs: Iterable[Doc]):
"""Apply the pipeline's model to a batch of docs, without modifying them. """Apply the pipeline's model to a batch of docs, without modifying them.

View File

@ -7,7 +7,6 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
import random import random
from typing import Optional
import srsly import srsly
from thinc.api import set_dropout_rate, CupyOps from thinc.api import set_dropout_rate, CupyOps
@ -30,7 +29,6 @@ from ..training import validate_examples, validate_get_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .. import util from .. import util
cdef class Parser(TrainablePipe): cdef class Parser(TrainablePipe):
""" """
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
@ -175,32 +173,31 @@ cdef class Parser(TrainablePipe):
with self.model.use_params(params): with self.model.use_params(params):
yield yield
def __call__(self, Doc doc):
"""Apply the parser or entity recognizer, setting the annotations onto
the `Doc` object.
doc (Doc): The document to be processed.
"""
states = self.predict([doc])
self.set_annotations([doc], states)
return doc
def pipe(self, docs, *, int batch_size=256): def pipe(self, docs, *, int batch_size=256):
"""Process a stream of documents. """Process a stream of documents.
docs: The sequence of documents to process. docs: The sequence of documents to process.
batch_size (int): Number of documents to accumulate into a working set. batch_size (int): Number of documents to accumulate into a working set.
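error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], Any]): Function that
deals with a failing batch of documents. It is not passed to this method but retrieved with
get_error_handler(), and receives the component's name, the component itself, the failing
batch and the exception. The default function just reraises the exception.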
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
cdef Doc doc cdef Doc doc
error_handler = self.get_error_handler()
for batch in util.minibatch(docs, size=batch_size): for batch in util.minibatch(docs, size=batch_size):
batch_in_order = list(batch) batch_in_order = list(batch)
try:
by_length = sorted(batch, key=lambda doc: len(doc)) by_length = sorted(batch, key=lambda doc: len(doc))
for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)):
subbatch = list(subbatch) subbatch = list(subbatch)
parse_states = self.predict(subbatch) parse_states = self.predict(subbatch)
self.set_annotations(subbatch, parse_states) self.set_annotations(subbatch, parse_states)
yield from batch_in_order yield from batch_in_order
except Exception as e:
error_handler(self.name, self, batch_in_order, e)
def predict(self, docs): def predict(self, docs):
if isinstance(docs, Doc): if isinstance(docs, Doc):

View File

@ -2,6 +2,7 @@ import pytest
import pickle import pickle
import re import re
import copy import copy
import logging
from mock import Mock from mock import Mock
from spacy.matcher import DependencyMatcher from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc from spacy.tokens import Doc
@ -334,3 +335,14 @@ def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches):
matcher.add("pattern", [pattern]) matcher.add("pattern", [pattern])
matches = matcher(doc) matches = matcher(doc)
assert len(matches) == num_matches assert len(matches) == num_matches
def test_dependency_matcher_long_matches(en_vocab, doc):
    """Adding a pattern whose token spec uses the "+" operator must raise
    a ValueError."""
    pattern = [
        {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"DEP": "amod", "OP": "+"}},
    ]

    matcher = DependencyMatcher(en_vocab)
    with pytest.raises(ValueError):
        matcher.add("pattern", [pattern])

View File

@ -0,0 +1,23 @@
import pytest
from ..util import make_tempdir
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    knowledge_base = KnowledgeBase(en_vocab, entity_vector_length=3)
    knowledge_base.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # An empty alias is rejected and must not end up in the KB.
    with pytest.raises(ValueError):
        knowledge_base.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert knowledge_base.contains_alias("") is False

    # Non-empty aliases are accepted.
    for alias, prior in (("x", 0.2), ("y", 0.1)):
        knowledge_base.add_alias(alias=alias, entities=["1"], probabilities=[prior])

    # Write the KB to disk and read it back into the same object.
    with make_tempdir() as kb_dir:
        knowledge_base.to_disk(kb_dir)
        knowledge_base.from_disk(kb_dir)
    assert knowledge_base.get_size_aliases() == 2
    assert set(knowledge_base.get_alias_strings()) == {"x", "y"}

View File

@ -1,4 +1,6 @@
import itertools import itertools
import logging
from unittest import mock
import pytest import pytest
from spacy.language import Language from spacy.language import Language
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
@ -6,7 +8,7 @@ from spacy.vocab import Vocab
from spacy.training import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.de import German from spacy.lang.de import German
from spacy.util import registry from spacy.util import registry, ignore_error, raise_error
import spacy import spacy
from .util import add_vecs_to_vocab, assert_docs_equal from .util import add_vecs_to_vocab, assert_docs_equal
@ -161,6 +163,81 @@ def test_language_pipe_stream(nlp2, n_process, texts):
assert_docs_equal(doc, expected_doc) assert_docs_equal(doc, expected_doc)
def test_language_pipe_error_handler():
    """Test that the error handling of nlp.pipe works well"""
    pipeline = English()
    pipeline.add_pipe("merge_subtokens")
    pipeline.initialize()
    first_text = "Curious to see what will happen to this text."
    all_texts = [first_text, "And this one."]

    # the pipeline fails because there's no parser
    with pytest.raises(ValueError):
        pipeline(first_text)
    with pytest.raises(ValueError):
        list(pipeline.pipe(all_texts))

    # explicitly installing the raising handler still raises
    pipeline.set_error_handler(raise_error)
    with pytest.raises(ValueError):
        list(pipeline.pipe(all_texts))

    # explicitly installing the ignoring handler: no error, and no docs
    pipeline.set_error_handler(ignore_error)
    processed = list(pipeline.pipe(all_texts))
    assert len(processed) == 0
    pipeline(first_text)
def test_language_pipe_error_handler_custom(en_vocab):
    """Test the error handling of a custom component that has no pipe method"""

    @Language.component("my_evil_component")
    def evil_component(doc):
        # Only texts containing a "2" make the component fail.
        if "2" not in doc.text:
            return doc
        raise ValueError("no dice")

    def warn_error(proc_name, proc, docs, e):
        from spacy.util import logger

        logger.warning(f"Trouble with component {proc_name}.")

    pipeline = English()
    pipeline.add_pipe("my_evil_component")
    pipeline.initialize()
    inputs = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"]

    # with no handler set, the evil custom component's error propagates
    with pytest.raises(ValueError):
        list(pipeline.pipe(inputs))

    pipeline.set_error_handler(warn_error)
    spacy_logger = logging.getLogger("spacy")
    with mock.patch.object(spacy_logger, "warning") as mock_warning:
        # each failing text now produces a warning instead of an exception
        processed = list(pipeline.pipe(inputs))
        mock_warning.assert_called()
        assert mock_warning.call_count == 2
        assert len(processed) + mock_warning.call_count == len(inputs)
        surviving_texts = [doc.text for doc in processed]
        assert surviving_texts == ["TEXT 111", "TEXT 333", "TEXT 666"]
def test_language_pipe_error_handler_pipe(en_vocab):
    """Test the error handling of a component's pipe method"""

    @Language.component("my_sentences")
    def perhaps_set_sentences(doc):
        # Texts starting with "4" are deliberately left without a
        # sentence boundary.
        if not doc.text.startswith("4"):
            doc[-1].is_sent_start = True
        return doc

    texts = [f"{i} is enough. Done" for i in range(100)]
    nlp = English()
    nlp.add_pipe("my_sentences")
    entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 3})
    entity_linker.kb.add_entity(entity="Q1", freq=12, entity_vector=[1, 2, 3])
    nlp.initialize()
    with pytest.raises(ValueError):
        # the entity linker requires sentence boundaries, will throw an error otherwise
        list(nlp.pipe(texts, batch_size=10))
    nlp.set_error_handler(ignore_error)
    docs = list(nlp.pipe(texts, batch_size=10))
    # we lose/ignore the failing 0-9 and 40-49 batches
    assert len(docs) == 80
def test_language_from_config_before_after_init(): def test_language_from_config_before_after_init():
name = "test_language_from_config_before_after_init" name = "test_language_from_config_before_after_init"
ran_before = False ran_before = False

View File

@ -356,7 +356,9 @@ def _add_entities_to_doc(doc, ner_data):
return return
elif ner_data == []: elif ner_data == []:
doc.ents = [] doc.ents = []
elif isinstance(ner_data[0], tuple): elif not isinstance(ner_data, (list, tuple)):
raise ValueError(Errors.E973)
elif isinstance(ner_data[0], (list, tuple)):
return _add_entities_to_doc( return _add_entities_to_doc(
doc, doc,
offsets_to_biluo_tags(doc, ner_data) offsets_to_biluo_tags(doc, ner_data)

View File

@ -1457,15 +1457,28 @@ def check_bool_env_var(env_var: str) -> bool:
return bool(value) return bool(value)
def _pipe(docs, proc, kwargs): def _pipe(docs, proc, name, default_error_handler, kwargs):
if hasattr(proc, "pipe"): if hasattr(proc, "pipe"):
yield from proc.pipe(docs, **kwargs) yield from proc.pipe(docs, **kwargs)
else: else:
# We added some args for pipe that __call__ doesn't expect. # We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs) kwargs = dict(kwargs)
error_handler = default_error_handler
if hasattr(proc, "get_error_handler"):
error_handler = proc.get_error_handler()
for arg in ["batch_size"]: for arg in ["batch_size"]:
if arg in kwargs: if arg in kwargs:
kwargs.pop(arg) kwargs.pop(arg)
for doc in docs: for doc in docs:
try:
doc = proc(doc, **kwargs) doc = proc(doc, **kwargs)
yield doc yield doc
except Exception as e:
error_handler(name, proc, [doc], e)
def raise_error(proc_name, proc, docs, e):
    """Error handler that re-raises the exception it is given.

    proc_name: Unused here.
    proc: Unused here.
    docs: Unused here.
    e: The exception to raise.
    """
    raise e
def ignore_error(proc_name, proc, docs, e):
    """Error handler that does nothing: the exception is silently dropped.

    All four arguments are accepted and ignored.
    """
    pass

View File

@ -82,7 +82,7 @@ Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb#add_entity) or to entities previously added with [`add_entity`](/api/kb#add_entity) or
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities [`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
should not exceed 1. should not exceed 1. Note that an empty string cannot be used as an alias.
> #### Example > #### Example
> >
@ -92,7 +92,7 @@ should not exceed 1.
| Name | Description | | Name | Description |
| --------------- | --------------------------------------------------------------------------------- | | --------------- | --------------------------------------------------------------------------------- |
| `alias` | The textual mention or alias. ~~str~~ | | `alias` | The textual mention or alias. Can not be the empty string. ~~str~~ |
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ | | `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ | | `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |

View File

@ -203,6 +203,28 @@ more efficient than processing texts one-by-one.
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ | | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
## Language.set_error_handler {#set_error_handler tag="method"}
Define a callback that will be invoked when an error is thrown during processing
of one or more documents. Specifically, this function will call
[`set_error_handler`](/api/pipe#set_error_handler) on all the pipeline
components that define that function. The error handler will be invoked with the
original component's name, the component itself, the list of documents that was
being processed, and the original error.
> #### Example
>
> ```python
> def warn_error(proc_name, proc, docs, e):
> print(f"An error occurred when applying component {proc_name}.")
>
> nlp.set_error_handler(warn_error)
> ```
| Name            | Description                                                                                                          |
| --------------- | -------------------------------------------------------------------------------------------------------------------- |
| `error_handler` | A function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], Any]~~ |
## Language.initialize {#initialize tag="method" new="3"} ## Language.initialize {#initialize tag="method" new="3"}
Initialize the pipeline for training and return an Initialize the pipeline for training and return an

View File

@ -100,6 +100,47 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | The processed documents in order. ~~Doc~~ | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## TrainablePipe.set_error_handler {#set_error_handler tag="method"}
Define a callback that will be invoked when an error is thrown during processing
of one or more documents with either [`__call__`](/api/pipe#call) or
[`pipe`](/api/pipe#pipe). The error handler will be invoked with the original
component's name, the component itself, the list of documents that was being
processed, and the original error.
> #### Example
>
> ```python
> def warn_error(proc_name, proc, docs, e):
> print(f"An error occurred when applying component {proc_name}.")
>
> pipe = nlp.add_pipe("ner")
> pipe.set_error_handler(warn_error)
> ```
| Name            | Description                                                                                                          |
| --------------- | -------------------------------------------------------------------------------------------------------------------- |
| `error_handler` | A function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], Any]~~ |
## TrainablePipe.get_error_handler {#get_error_handler tag="method"}
Retrieve the callback that performs error handling for this component's
[`__call__`](/api/pipe#call) and [`pipe`](/api/pipe#pipe) methods. If no custom
function was previously defined with
[`set_error_handler`](/api/pipe#set_error_handler), a default function is
returned that simply reraises the exception.
> #### Example
>
> ```python
> pipe = nlp.add_pipe("ner")
> error_handler = pipe.get_error_handler()
> ```
| Name        | Description                                                                                                            |
| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
| **RETURNS** | The function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], Any]~~ |
## TrainablePipe.initialize {#initialize tag="method" new="3"} ## TrainablePipe.initialize {#initialize tag="method" new="3"}
Initialize the component for training. `get_examples` should be a function that Initialize the component for training. `get_examples` should be a function that
@ -191,7 +232,7 @@ predictions and gold-standard annotations, and update the component's model.
> ``` > ```
| Name | Description | | Name | Description |
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `drop` | The dropout rate. ~~float~~ | | `drop` | The dropout rate. ~~float~~ |