mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	Merge branch 'develop' into feature/replace-listeners
This commit is contained in:
		
						commit
						01ecfbcc45
					
				|  | @ -132,6 +132,11 @@ class Warnings: | |||
|             "'morphologizer'.") | ||||
|     W109 = ("Unable to save user hooks while serializing the doc. Re-add any " | ||||
|             "required user hooks to the doc after processing.") | ||||
|     W110 = ("The DependencyMatcher token pattern {pattern} matched a span " | ||||
|             "{tokens} that is 2+ tokens long. Only the first token in the span " | ||||
|             "will be included in the results. For better results, token " | ||||
|             "patterns should return matches that are each exactly one token " | ||||
|             "long.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
|  | @ -470,6 +475,10 @@ class Errors: | |||
|             "issue tracker: http://github.com/explosion/spaCy/issues") | ||||
| 
 | ||||
|     # TODO: fix numbering after merging develop into master | ||||
|     E890 = ("Can not add the alias '{alias}' to the Knowledge base. " | ||||
|             "Each alias should be a meaningful string.") | ||||
|     E891 = ("Alias '{alias}' could not be added to the Knowledge base. " | ||||
|             "This is likely a bug in spaCy.") | ||||
|     E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}") | ||||
|     E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " | ||||
|             "If you're using a custom function, make sure the code is available. " | ||||
|  | @ -747,6 +756,10 @@ class Errors: | |||
|              "file.json .`.") | ||||
|     E1015 = ("Can't initialize model from config: no {value} found. For more " | ||||
|              "information, run: python -m spacy debug config config.cfg") | ||||
|     E1016 = ("The operators 'OP': '?', '*', and '+' are not supported in " | ||||
|              "DependencyMatcher token patterns. The token pattern in " | ||||
|              "RIGHT_ATTR should return matches that are each exactly one token " | ||||
|              "long. Invalid pattern:\n{node}") | ||||
| 
 | ||||
| 
 | ||||
| # Deprecated model shortcuts, only used in errors and warnings | ||||
|  |  | |||
|  | @ -187,6 +187,10 @@ cdef class KnowledgeBase: | |||
|         For a given alias, add its potential entities and prior probabilies to the KB. | ||||
|         Return the alias_hash at the end | ||||
|         """ | ||||
|         if alias is None or len(alias) == 0: | ||||
|             raise ValueError(Errors.E890.format(alias=alias)) | ||||
| 
 | ||||
|         previous_alias_nr = self.get_size_aliases() | ||||
|         # Throw an error if the length of entities and probabilities are not the same | ||||
|         if not len(entities) == len(probabilities): | ||||
|             raise ValueError(Errors.E132.format(alias=alias, | ||||
|  | @ -220,6 +224,8 @@ cdef class KnowledgeBase: | |||
|         new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) | ||||
|         self._alias_index[alias_hash] = new_index | ||||
| 
 | ||||
|         if previous_alias_nr + 1 != self.get_size_aliases(): | ||||
|             raise RuntimeError(Errors.E891.format(alias=alias)) | ||||
|         return alias_hash | ||||
| 
 | ||||
|     def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False): | ||||
|  |  | |||
|  | @ -20,7 +20,7 @@ from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis | |||
| from .training import Example, validate_examples | ||||
| from .training.initialize import init_vocab, init_tok2vec | ||||
| from .scorer import Scorer | ||||
| from .util import registry, SimpleFrozenList, _pipe | ||||
| from .util import registry, SimpleFrozenList, _pipe, raise_error | ||||
| from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER | ||||
| from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS | ||||
| from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES | ||||
|  | @ -176,6 +176,7 @@ class Language: | |||
|             create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"] | ||||
|         self.tokenizer = create_tokenizer(self) | ||||
|         self.batch_size = batch_size | ||||
|         self.default_error_handler = raise_error | ||||
| 
 | ||||
|     def __init_subclass__(cls, **kwargs): | ||||
|         super().__init_subclass__(**kwargs) | ||||
|  | @ -1022,11 +1023,16 @@ class Language: | |||
|                 continue | ||||
|             if not hasattr(proc, "__call__"): | ||||
|                 raise ValueError(Errors.E003.format(component=type(proc), name=name)) | ||||
|             error_handler = self.default_error_handler | ||||
|             if hasattr(proc, "get_error_handler"): | ||||
|                 error_handler = proc.get_error_handler() | ||||
|             try: | ||||
|                 doc = proc(doc, **component_cfg.get(name, {})) | ||||
|             except KeyError as e: | ||||
|                 # This typically happens if a component is not initialized | ||||
|                 raise ValueError(Errors.E109.format(name=name)) from e | ||||
|             except Exception as e: | ||||
|                 error_handler(name, proc, [doc], e) | ||||
|             if doc is None: | ||||
|                 raise ValueError(Errors.E005.format(name=name)) | ||||
|         return doc | ||||
|  | @ -1315,6 +1321,26 @@ class Language: | |||
|             self._optimizer = self.create_optimizer() | ||||
|         return self._optimizer | ||||
| 
 | ||||
|     def set_error_handler( | ||||
|         self, | ||||
|         error_handler: Callable[ | ||||
|             [str, Callable[[Doc], Doc], List[Doc], Exception], None | ||||
|         ], | ||||
|     ): | ||||
|         """Set an error handler object for all the components in the pipeline that implement | ||||
|         a set_error_handler function. | ||||
| 
 | ||||
|         error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]): | ||||
|             Function that deals with a failing batch of documents. This callable function should take in | ||||
|             the component's name, the component itself, the offending batch of documents, and the exception | ||||
|             that was thrown. | ||||
|         DOCS: https://nightly.spacy.io/api/language#set_error_handler | ||||
|         """ | ||||
|         self.default_error_handler = error_handler | ||||
|         for name, pipe in self.pipeline: | ||||
|             if hasattr(pipe, "set_error_handler"): | ||||
|                 pipe.set_error_handler(error_handler) | ||||
| 
 | ||||
|     def evaluate( | ||||
|         self, | ||||
|         examples: Iterable[Example], | ||||
|  | @ -1334,6 +1360,7 @@ class Language: | |||
|             arguments for specific components. | ||||
|         scorer_cfg (dict): An optional dictionary with extra keyword arguments | ||||
|             for the scorer. | ||||
| 
 | ||||
|         RETURNS (Scorer): The scorer containing the evaluation results. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/language#evaluate | ||||
|  | @ -1358,7 +1385,14 @@ class Language: | |||
|             kwargs = component_cfg.get(name, {}) | ||||
|             kwargs.setdefault("batch_size", batch_size) | ||||
|             for doc, eg in zip( | ||||
|                 _pipe((eg.predicted for eg in examples), pipe, kwargs), examples | ||||
|                 _pipe( | ||||
|                     (eg.predicted for eg in examples), | ||||
|                     proc=pipe, | ||||
|                     name=name, | ||||
|                     default_error_handler=self.default_error_handler, | ||||
|                     kwargs=kwargs, | ||||
|                 ), | ||||
|                 examples, | ||||
|             ): | ||||
|                 eg.predicted = doc | ||||
|         end_time = timer() | ||||
|  | @ -1463,7 +1497,13 @@ class Language: | |||
|             kwargs = component_cfg.get(name, {}) | ||||
|             # Allow component_cfg to overwrite the top-level kwargs. | ||||
|             kwargs.setdefault("batch_size", batch_size) | ||||
|             f = functools.partial(_pipe, proc=proc, kwargs=kwargs) | ||||
|             f = functools.partial( | ||||
|                 _pipe, | ||||
|                 proc=proc, | ||||
|                 name=name, | ||||
|                 kwargs=kwargs, | ||||
|                 default_error_handler=self.default_error_handler, | ||||
|             ) | ||||
|             pipes.append(f) | ||||
| 
 | ||||
|         if n_process != 1: | ||||
|  |  | |||
|  | @ -9,8 +9,9 @@ from .matcher cimport Matcher | |||
| from ..vocab cimport Vocab | ||||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| from ..errors import Errors, Warnings | ||||
| from ..tokens import Span | ||||
| from ..util import logger | ||||
| 
 | ||||
| 
 | ||||
| DELIMITER = "||" | ||||
|  | @ -137,6 +138,8 @@ cdef class DependencyMatcher: | |||
|                     raise ValueError(Errors.E1007.format(op=relation["REL_OP"])) | ||||
|                 visited_nodes[relation["RIGHT_ID"]] = True | ||||
|                 visited_nodes[relation["LEFT_ID"]] = True | ||||
|             if relation["RIGHT_ATTRS"].get("OP", "") in ("?", "*", "+"): | ||||
|                 raise ValueError(Errors.E1016.format(node=relation)) | ||||
|             idx = idx + 1 | ||||
| 
 | ||||
|     def _get_matcher_key(self, key, pattern_idx, token_idx): | ||||
|  | @ -277,7 +280,9 @@ cdef class DependencyMatcher: | |||
|         e.g. keys_to_position_maps[root_index][match_id] = [...] | ||||
|         """ | ||||
|         keys_to_position_maps = defaultdict(lambda: defaultdict(list)) | ||||
|         for match_id, start, _ in self._matcher(doc): | ||||
|         for match_id, start, end in self._matcher(doc): | ||||
|             if start + 1 != end: | ||||
|                 logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) | ||||
|             token = doc[start] | ||||
|             root = ([token] + list(token.ancestors))[-1] | ||||
|             keys_to_position_maps[root.i][match_id].append(start) | ||||
|  |  | |||
|  | @ -96,12 +96,25 @@ class AttributeRuler(Pipe): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/attributeruler#call | ||||
|         """ | ||||
|         error_handler = self.get_error_handler() | ||||
|         try: | ||||
|             matches = self.match(doc) | ||||
|             self.set_annotations(doc, matches) | ||||
|             return doc | ||||
|         except Exception as e: | ||||
|             error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def match(self, doc: Doc): | ||||
|         matches = self.matcher(doc, allow_missing=True) | ||||
|         # Sort by the attribute ID, so that later rules have precendence | ||||
|         matches = [ | ||||
|             (int(self.vocab.strings[m_id]), m_id, s, e) for m_id, s, e in matches | ||||
|         ] | ||||
|         matches.sort() | ||||
|         return matches | ||||
| 
 | ||||
|     def set_annotations(self, doc, matches): | ||||
|         """Modify the document in place""" | ||||
|         for attr_id, match_id, start, end in matches: | ||||
|             span = Span(doc, start, end, label=match_id) | ||||
|             attrs = self.attrs[attr_id] | ||||
|  | @ -121,7 +134,7 @@ class AttributeRuler(Pipe): | |||
|                     ) | ||||
|                 ) from None | ||||
|             set_token_attrs(span[index], attrs) | ||||
|         return doc | ||||
| 
 | ||||
| 
 | ||||
|     def load_from_tag_map( | ||||
|         self, tag_map: Dict[str, Dict[Union[int, str], Union[int, str]]] | ||||
|  |  | |||
|  | @ -1,6 +1,6 @@ | |||
| from itertools import islice | ||||
| from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List | ||||
| from typing import Optional, Iterable, Callable, Dict, Union, List | ||||
| from pathlib import Path | ||||
| from itertools import islice | ||||
| import srsly | ||||
| import random | ||||
| from thinc.api import CosineDistance, Model, Optimizer, Config | ||||
|  | @ -276,34 +276,6 @@ class EntityLinker(TrainablePipe): | |||
|         loss = loss / len(entity_encodings) | ||||
|         return loss, gradients | ||||
| 
 | ||||
|     def __call__(self, doc: Doc) -> Doc: | ||||
|         """Apply the pipe to a Doc. | ||||
| 
 | ||||
|         doc (Doc): The document to process. | ||||
|         RETURNS (Doc): The processed Doc. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/entitylinker#call | ||||
|         """ | ||||
|         kb_ids = self.predict([doc]) | ||||
|         self.set_annotations([doc], kb_ids) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/entitylinker#pipe | ||||
|         """ | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             kb_ids = self.predict(docs) | ||||
|             self.set_annotations(docs, kb_ids) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs: Iterable[Doc]) -> List[str]: | ||||
|         """Apply the pipeline's model to a batch of docs, without modifying them. | ||||
|         Returns the KB IDs for each entity in each doc, including NIL if there is | ||||
|  |  | |||
|  | @ -135,12 +135,25 @@ class EntityRuler(Pipe): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/entityruler#call | ||||
|         """ | ||||
|         error_handler = self.get_error_handler() | ||||
|         try: | ||||
|             matches = self.match(doc) | ||||
|             self.set_annotations(doc, matches) | ||||
|             return doc | ||||
|         except Exception as e: | ||||
|             error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def match(self, doc: Doc): | ||||
|         matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) | ||||
|         matches = set( | ||||
|             [(m_id, start, end) for m_id, start, end in matches if start != end] | ||||
|         ) | ||||
|         get_sort_key = lambda m: (m[2] - m[1], -m[1]) | ||||
|         matches = sorted(matches, key=get_sort_key, reverse=True) | ||||
|         return matches | ||||
| 
 | ||||
|     def set_annotations(self, doc, matches): | ||||
|         """Modify the document in place""" | ||||
|         entities = list(doc.ents) | ||||
|         new_entities = [] | ||||
|         seen_tokens = set() | ||||
|  | @ -163,7 +176,6 @@ class EntityRuler(Pipe): | |||
|                 ] | ||||
|                 seen_tokens.update(range(start, end)) | ||||
|         doc.ents = entities + new_entities | ||||
|         return doc | ||||
| 
 | ||||
|     @property | ||||
|     def labels(self) -> Tuple[str, ...]: | ||||
|  |  | |||
|  | @ -23,11 +23,7 @@ from .. import util | |||
|     default_score_weights={"lemma_acc": 1.0}, | ||||
| ) | ||||
| def make_lemmatizer( | ||||
|     nlp: Language, | ||||
|     model: Optional[Model], | ||||
|     name: str, | ||||
|     mode: str, | ||||
|     overwrite: bool = False, | ||||
|     nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool = False | ||||
| ): | ||||
|     return Lemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) | ||||
| 
 | ||||
|  | @ -107,10 +103,14 @@ class Lemmatizer(Pipe): | |||
|         """ | ||||
|         if not self._validated: | ||||
|             self._validate_tables(Errors.E1004) | ||||
|         for token in doc: | ||||
|             if self.overwrite or token.lemma == 0: | ||||
|                 token.lemma_ = self.lemmatize(token)[0] | ||||
|         return doc | ||||
|         error_handler = self.get_error_handler() | ||||
|         try: | ||||
|             for token in doc: | ||||
|                 if self.overwrite or token.lemma == 0: | ||||
|                     token.lemma_ = self.lemmatize(token)[0] | ||||
|             return doc | ||||
|         except Exception as e: | ||||
|             error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def initialize( | ||||
|         self, | ||||
|  | @ -154,21 +154,6 @@ class Lemmatizer(Pipe): | |||
|                 ) | ||||
|         self._validated = True | ||||
| 
 | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/lemmatizer#pipe | ||||
|         """ | ||||
|         for doc in stream: | ||||
|             doc = self(doc) | ||||
|             yield doc | ||||
| 
 | ||||
|     def lookup_lemmatize(self, token: Token) -> List[str]: | ||||
|         """Lemmatize using a lookup-based approach. | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,13 +1,14 @@ | |||
| # cython: infer_types=True, profile=True | ||||
| import warnings | ||||
| from typing import Optional, Tuple, Iterable, Iterator, Callable, Union, Dict | ||||
| import srsly | ||||
| import warnings | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
| from ..training import Example | ||||
| from ..errors import Errors, Warnings | ||||
| from ..language import Language | ||||
| from ..util import raise_error | ||||
| 
 | ||||
| cdef class Pipe: | ||||
|     """This class is a base class and not instantiated directly. It provides | ||||
|  | @ -48,9 +49,13 @@ cdef class Pipe: | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/pipe#pipe | ||||
|         """ | ||||
|         error_handler = self.get_error_handler() | ||||
|         for doc in stream: | ||||
|             doc = self(doc) | ||||
|             yield doc | ||||
|             try: | ||||
|                 doc = self(doc) | ||||
|                 yield doc | ||||
|             except Exception as e: | ||||
|                 error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None): | ||||
|         """Initialize the pipe. For non-trainable components, this method | ||||
|  | @ -98,6 +103,30 @@ cdef class Pipe: | |||
|         if not self.labels or list(self.labels) == [""]: | ||||
|             raise ValueError(Errors.E143.format(name=self.name)) | ||||
| 
 | ||||
|     def set_error_handler(self, error_handler: Callable) -> None: | ||||
|         """Set an error handler function. | ||||
| 
 | ||||
|         error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], None]): | ||||
|             Function that deals with a failing batch of documents. This callable function should take in | ||||
|             the component's name, the component itself, the offending batch of documents, and the exception | ||||
|             that was thrown. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/pipe#set_error_handler | ||||
|         """ | ||||
|         self.error_handler = error_handler | ||||
| 
 | ||||
|     def get_error_handler(self) -> Optional[Callable]: | ||||
|         """Retrieve the error handler function. | ||||
| 
 | ||||
|         RETURNS (Callable): The error handler, or if it's not set a default function that just reraises. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/pipe#get_error_handler | ||||
|         """ | ||||
|         if hasattr(self, "error_handler"): | ||||
|             return self.error_handler | ||||
|         return raise_error | ||||
| 
 | ||||
| 
 | ||||
| def deserialize_config(path): | ||||
|     if path.exists(): | ||||
|         return srsly.read_json(path) | ||||
|  |  | |||
|  | @ -1,16 +1,14 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| import srsly | ||||
| from typing import Optional, List | ||||
| import srsly | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
| from .pipe import Pipe | ||||
| from ..language import Language | ||||
| from ..scorer import Scorer | ||||
| from ..training import validate_examples | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| @Language.factory( | ||||
|     "sentencizer", | ||||
|     assigns=["token.is_sent_start", "doc.sents"], | ||||
|  | @ -66,6 +64,14 @@ class Sentencizer(Pipe): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/sentencizer#call | ||||
|         """ | ||||
|         error_handler = self.get_error_handler() | ||||
|         try: | ||||
|             self._call(doc) | ||||
|             return doc | ||||
|         except Exception as e: | ||||
|             error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def _call(self, doc): | ||||
|         start = 0 | ||||
|         seen_period = False | ||||
|         for i, token in enumerate(doc): | ||||
|  | @ -79,23 +85,6 @@ class Sentencizer(Pipe): | |||
|                 seen_period = True | ||||
|         if start < len(doc): | ||||
|             doc[start].is_sent_start = True | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128): | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/sentencizer#pipe | ||||
|         """ | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             predictions = self.predict(docs) | ||||
|             self.set_annotations(docs, predictions) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|         """Apply the pipe to a batch of docs, without modifying them. | ||||
|  |  | |||
|  | @ -1,5 +1,4 @@ | |||
| # cython: infer_types=True, profile=True, binding=True | ||||
| from typing import List | ||||
| import numpy | ||||
| import srsly | ||||
| from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config | ||||
|  | @ -95,34 +94,6 @@ class Tagger(TrainablePipe): | |||
|         """Data about the labels currently added to the component.""" | ||||
|         return tuple(self.cfg["labels"]) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         """Apply the pipe to a Doc. | ||||
| 
 | ||||
|         doc (Doc): The document to process. | ||||
|         RETURNS (Doc): The processed Doc. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/tagger#call | ||||
|         """ | ||||
|         tags = self.predict([doc]) | ||||
|         self.set_annotations([doc], tags) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, *, batch_size=128): | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/tagger#pipe | ||||
|         """ | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             tag_ids = self.predict(docs) | ||||
|             self.set_annotations(docs, tag_ids) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|         """Apply the pipeline's model to a batch of docs, without modifying them. | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,5 +1,5 @@ | |||
| from itertools import islice | ||||
| from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any | ||||
| from typing import Iterable, Tuple, Optional, Dict, List, Callable, Any | ||||
| from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config | ||||
| from thinc.types import Floats2d | ||||
| import numpy | ||||
|  | @ -9,7 +9,6 @@ from ..language import Language | |||
| from ..training import Example, validate_examples, validate_get_examples | ||||
| from ..errors import Errors | ||||
| from ..scorer import Scorer | ||||
| from .. import util | ||||
| from ..tokens import Doc | ||||
| from ..vocab import Vocab | ||||
| 
 | ||||
|  | @ -144,22 +143,6 @@ class TextCategorizer(TrainablePipe): | |||
|         """ | ||||
|         return self.labels | ||||
| 
 | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int = 128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/textcategorizer#pipe | ||||
|         """ | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             scores = self.predict(docs) | ||||
|             self.set_annotations(docs, scores) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs: Iterable[Doc]): | ||||
|         """Apply the pipeline's model to a batch of docs, without modifying them. | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,4 +1,4 @@ | |||
| from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List | ||||
| from typing import Sequence, Iterable, Optional, Dict, Callable, List | ||||
| from thinc.api import Model, set_dropout_rate, Optimizer, Config | ||||
| from itertools import islice | ||||
| 
 | ||||
|  | @ -8,8 +8,6 @@ from ..tokens import Doc | |||
| from ..vocab import Vocab | ||||
| from ..language import Language | ||||
| from ..errors import Errors | ||||
| from ..util import minibatch | ||||
| 
 | ||||
| 
 | ||||
| default_model_config = """ | ||||
| [model] | ||||
|  | @ -99,36 +97,6 @@ class Tok2Vec(TrainablePipe): | |||
|                 if isinstance(node, Tok2VecListener) and node.upstream_name in names: | ||||
|                     self.add_listener(node, component.name) | ||||
| 
 | ||||
|     def __call__(self, doc: Doc) -> Doc: | ||||
|         """Add context-sensitive embeddings to the Doc.tensor attribute, allowing | ||||
|         them to be used as features by downstream components. | ||||
| 
 | ||||
|         docs (Doc): The Doc to process. | ||||
|         RETURNS (Doc): The processed Doc. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/tok2vec#call | ||||
|         """ | ||||
|         tokvecses = self.predict([doc]) | ||||
|         self.set_annotations([doc], tokvecses) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream: Iterator[Doc], *, batch_size: int = 128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|         the hood when the nlp object is called on a text and all components are | ||||
|         applied to the Doc. | ||||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/tok2vec#pipe | ||||
|         """ | ||||
|         for docs in minibatch(stream, batch_size): | ||||
|             docs = list(docs) | ||||
|             tokvecses = self.predict(docs) | ||||
|             self.set_annotations(docs, tokvecses) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, docs: Iterable[Doc]): | ||||
|         """Apply the pipeline's model to a batch of docs, without modifying them. | ||||
|         Returns a single tensor for a batch of documents. | ||||
|  |  | |||
|  | @ -28,7 +28,7 @@ cdef class TrainablePipe(Pipe): | |||
|         vocab (Vocab): The shared vocabulary. | ||||
|         model (thinc.api.Model): The Thinc Model powering the pipeline component. | ||||
|         name (str): The component instance name. | ||||
|         **cfg: Additonal settings and config parameters. | ||||
|         **cfg: Additional settings and config parameters. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/pipe#init | ||||
|         """ | ||||
|  | @ -47,9 +47,13 @@ cdef class TrainablePipe(Pipe): | |||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/pipe#call | ||||
|         """ | ||||
|         scores = self.predict([doc]) | ||||
|         self.set_annotations([doc], scores) | ||||
|         return doc | ||||
|         error_handler = self.get_error_handler() | ||||
|         try: | ||||
|             scores = self.predict([doc]) | ||||
|             self.set_annotations([doc], scores) | ||||
|             return doc | ||||
|         except Exception as e: | ||||
|             error_handler(self.name, self, [doc], e) | ||||
| 
 | ||||
|     def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]: | ||||
|         """Apply the pipe to a stream of documents. This usually happens under | ||||
|  | @ -58,14 +62,21 @@ cdef class TrainablePipe(Pipe): | |||
| 
 | ||||
|         stream (Iterable[Doc]): A stream of documents. | ||||
|         batch_size (int): The number of documents to buffer. | ||||
|         error_handler (Callable[[str, List[Doc], Exception], Any]): Function that | ||||
|             deals with a failing batch of documents. The default function just reraises | ||||
|             the exception. | ||||
|         YIELDS (Doc): Processed documents in order. | ||||
| 
 | ||||
|         DOCS: https://nightly.spacy.io/api/pipe#pipe | ||||
|         """ | ||||
|         error_handler = self.get_error_handler() | ||||
|         for docs in util.minibatch(stream, size=batch_size): | ||||
|             scores = self.predict(docs) | ||||
|             self.set_annotations(docs, scores) | ||||
|             yield from docs | ||||
|             try: | ||||
|                 scores = self.predict(docs) | ||||
|                 self.set_annotations(docs, scores) | ||||
|                 yield from docs | ||||
|             except Exception as e: | ||||
|                 error_handler(self.name, self, docs, e) | ||||
| 
 | ||||
|     def predict(self, docs: Iterable[Doc]): | ||||
|         """Apply the pipeline's model to a batch of docs, without modifying them. | ||||
|  |  | |||
|  | @ -7,7 +7,6 @@ from libcpp.vector cimport vector | |||
| from libc.string cimport memset, memcpy | ||||
| from libc.stdlib cimport calloc, free | ||||
| import random | ||||
| from typing import Optional | ||||
| 
 | ||||
| import srsly | ||||
| from thinc.api import set_dropout_rate, CupyOps | ||||
|  | @ -30,7 +29,6 @@ from ..training import validate_examples, validate_get_examples | |||
| from ..errors import Errors, Warnings | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
| cdef class Parser(TrainablePipe): | ||||
|     """ | ||||
|     Base class of the DependencyParser and EntityRecognizer. | ||||
|  | @ -175,32 +173,31 @@ cdef class Parser(TrainablePipe): | |||
|         with self.model.use_params(params): | ||||
|             yield | ||||
| 
 | ||||
|     def __call__(self, Doc doc): | ||||
|         """Apply the parser or entity recognizer, setting the annotations onto | ||||
|         the `Doc` object. | ||||
| 
 | ||||
|         doc (Doc): The document to be processed. | ||||
|         """ | ||||
|         states = self.predict([doc]) | ||||
|         self.set_annotations([doc], states) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, docs, *, int batch_size=256): | ||||
|         """Process a stream of documents. | ||||
| 
 | ||||
|         stream: The sequence of documents to process. | ||||
|         batch_size (int): Number of documents to accumulate into a working set. | ||||
|         error_handler (Callable[[str, List[Doc], Exception], Any]): Function that | ||||
|             deals with a failing batch of documents. The default function just reraises | ||||
|             the exception. | ||||
| 
 | ||||
|         YIELDS (Doc): Documents, in order. | ||||
|         """ | ||||
|         cdef Doc doc | ||||
|         error_handler = self.get_error_handler() | ||||
|         for batch in util.minibatch(docs, size=batch_size): | ||||
|             batch_in_order = list(batch) | ||||
|             by_length = sorted(batch, key=lambda doc: len(doc)) | ||||
|             for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): | ||||
|                 subbatch = list(subbatch) | ||||
|                 parse_states = self.predict(subbatch) | ||||
|                 self.set_annotations(subbatch, parse_states) | ||||
|             yield from batch_in_order | ||||
|             try: | ||||
|                 by_length = sorted(batch, key=lambda doc: len(doc)) | ||||
|                 for subbatch in util.minibatch(by_length, size=max(batch_size//4, 2)): | ||||
|                     subbatch = list(subbatch) | ||||
|                     parse_states = self.predict(subbatch) | ||||
|                     self.set_annotations(subbatch, parse_states) | ||||
|                 yield from batch_in_order | ||||
|             except Exception as e: | ||||
|                 error_handler(self.name, self, batch_in_order, e) | ||||
| 
 | ||||
| 
 | ||||
|     def predict(self, docs): | ||||
|         if isinstance(docs, Doc): | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ import pytest | |||
| import pickle | ||||
| import re | ||||
| import copy | ||||
| import logging | ||||
| from mock import Mock | ||||
| from spacy.matcher import DependencyMatcher | ||||
| from spacy.tokens import Doc | ||||
|  | @ -334,3 +335,14 @@ def test_dependency_matcher_ops(en_vocab, doc, left, right, op, num_matches): | |||
|     matcher.add("pattern", [pattern]) | ||||
|     matches = matcher(doc) | ||||
|     assert len(matches) == num_matches | ||||
| 
 | ||||
| 
 | ||||
| def test_dependency_matcher_long_matches(en_vocab, doc): | ||||
|     pattern = [ | ||||
|         {"RIGHT_ID": "quick", "RIGHT_ATTRS": {"DEP": "amod", "OP": "+"}}, | ||||
|     ] | ||||
| 
 | ||||
|     matcher = DependencyMatcher(en_vocab) | ||||
|     logger = logging.getLogger("spacy") | ||||
|     with pytest.raises(ValueError): | ||||
|         matcher.add("pattern", [pattern]) | ||||
|  |  | |||
							
								
								
									
										23
									
								
								spacy/tests/regression/test_issue6730.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								spacy/tests/regression/test_issue6730.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,23 @@ | |||
| import pytest | ||||
| from ..util import make_tempdir | ||||
| 
 | ||||
| 
 | ||||
| def test_issue6730(en_vocab): | ||||
|     """Ensure that the KB does not accept empty strings, but otherwise IO works fine.""" | ||||
|     from spacy.kb import KnowledgeBase | ||||
| 
 | ||||
|     kb = KnowledgeBase(en_vocab, entity_vector_length=3) | ||||
|     kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3]) | ||||
| 
 | ||||
|     with pytest.raises(ValueError): | ||||
|         kb.add_alias(alias="", entities=["1"], probabilities=[0.4]) | ||||
|     assert kb.contains_alias("") is False | ||||
| 
 | ||||
|     kb.add_alias(alias="x", entities=["1"], probabilities=[0.2]) | ||||
|     kb.add_alias(alias="y", entities=["1"], probabilities=[0.1]) | ||||
| 
 | ||||
|     with make_tempdir() as tmp_dir: | ||||
|         kb.to_disk(tmp_dir) | ||||
|         kb.from_disk(tmp_dir) | ||||
|     assert kb.get_size_aliases() == 2 | ||||
|     assert set(kb.get_alias_strings()) == {"x", "y"} | ||||
|  | @ -1,4 +1,6 @@ | |||
| import itertools | ||||
| import logging | ||||
| from unittest import mock | ||||
| import pytest | ||||
| from spacy.language import Language | ||||
| from spacy.tokens import Doc, Span | ||||
|  | @ -6,7 +8,7 @@ from spacy.vocab import Vocab | |||
| from spacy.training import Example | ||||
| from spacy.lang.en import English | ||||
| from spacy.lang.de import German | ||||
| from spacy.util import registry | ||||
| from spacy.util import registry, ignore_error, raise_error | ||||
| import spacy | ||||
| 
 | ||||
| from .util import add_vecs_to_vocab, assert_docs_equal | ||||
|  | @ -161,6 +163,81 @@ def test_language_pipe_stream(nlp2, n_process, texts): | |||
|         assert_docs_equal(doc, expected_doc) | ||||
| 
 | ||||
| 
 | ||||
| def test_language_pipe_error_handler(): | ||||
|     """Test that the error handling of nlp.pipe works well""" | ||||
|     nlp = English() | ||||
|     nlp.add_pipe("merge_subtokens") | ||||
|     nlp.initialize() | ||||
|     texts = ["Curious to see what will happen to this text.", "And this one."] | ||||
|     # the pipeline fails because there's no parser | ||||
|     with pytest.raises(ValueError): | ||||
|         nlp(texts[0]) | ||||
|     with pytest.raises(ValueError): | ||||
|         list(nlp.pipe(texts)) | ||||
|     nlp.set_error_handler(raise_error) | ||||
|     with pytest.raises(ValueError): | ||||
|         list(nlp.pipe(texts)) | ||||
|     # set explicitely to ignoring | ||||
|     nlp.set_error_handler(ignore_error) | ||||
|     docs = list(nlp.pipe(texts)) | ||||
|     assert len(docs) == 0 | ||||
|     nlp(texts[0]) | ||||
| 
 | ||||
| 
 | ||||
| def test_language_pipe_error_handler_custom(en_vocab): | ||||
|     """Test the error handling of a custom component that has no pipe method""" | ||||
|     @Language.component("my_evil_component") | ||||
|     def evil_component(doc): | ||||
|         if "2" in doc.text: | ||||
|             raise ValueError("no dice") | ||||
|         return doc | ||||
| 
 | ||||
|     def warn_error(proc_name, proc, docs, e): | ||||
|         from spacy.util import logger | ||||
|         logger.warning(f"Trouble with component {proc_name}.") | ||||
| 
 | ||||
|     nlp = English() | ||||
|     nlp.add_pipe("my_evil_component") | ||||
|     nlp.initialize() | ||||
|     texts = ["TEXT 111", "TEXT 222", "TEXT 333", "TEXT 342", "TEXT 666"] | ||||
|     with pytest.raises(ValueError): | ||||
|         # the evil custom component throws an error | ||||
|         list(nlp.pipe(texts)) | ||||
| 
 | ||||
|     nlp.set_error_handler(warn_error) | ||||
|     logger = logging.getLogger("spacy") | ||||
|     with mock.patch.object(logger, "warning") as mock_warning: | ||||
|         # the errors by the evil custom component raise a warning for each bad batch | ||||
|         docs = list(nlp.pipe(texts)) | ||||
|         mock_warning.assert_called() | ||||
|         assert mock_warning.call_count == 2 | ||||
|         assert len(docs) + mock_warning.call_count == len(texts) | ||||
|         assert [doc.text for doc in docs] == ["TEXT 111", "TEXT 333", "TEXT 666"] | ||||
| 
 | ||||
| 
 | ||||
| def test_language_pipe_error_handler_pipe(en_vocab): | ||||
|     """Test the error handling of a component's pipe method""" | ||||
|     @Language.component("my_sentences") | ||||
|     def perhaps_set_sentences(doc): | ||||
|         if not doc.text.startswith("4"): | ||||
|             doc[-1].is_sent_start = True | ||||
|         return doc | ||||
| 
 | ||||
|     texts = [f"{str(i)} is enough. Done" for i in range(100)] | ||||
|     nlp = English() | ||||
|     nlp.add_pipe("my_sentences") | ||||
|     entity_linker = nlp.add_pipe("entity_linker", config={"entity_vector_length": 3}) | ||||
|     entity_linker.kb.add_entity(entity="Q1", freq=12, entity_vector=[1, 2, 3]) | ||||
|     nlp.initialize() | ||||
|     with pytest.raises(ValueError): | ||||
|         # the entity linker requires sentence boundaries, will throw an error otherwise | ||||
|         docs = list(nlp.pipe(texts, batch_size=10)) | ||||
|     nlp.set_error_handler(ignore_error) | ||||
|     docs = list(nlp.pipe(texts, batch_size=10)) | ||||
|     # we lose/ignore the failing 0-9 and 40-49 batches | ||||
|     assert len(docs) == 80 | ||||
| 
 | ||||
| 
 | ||||
| def test_language_from_config_before_after_init(): | ||||
|     name = "test_language_from_config_before_after_init" | ||||
|     ran_before = False | ||||
|  |  | |||
|  | @ -356,7 +356,9 @@ def _add_entities_to_doc(doc, ner_data): | |||
|         return | ||||
|     elif ner_data == []: | ||||
|         doc.ents = [] | ||||
|     elif isinstance(ner_data[0], tuple): | ||||
|     elif not isinstance(ner_data, (list, tuple)): | ||||
|         raise ValueError(Errors.E973) | ||||
|     elif isinstance(ner_data[0], (list, tuple)): | ||||
|         return _add_entities_to_doc( | ||||
|             doc, | ||||
|             offsets_to_biluo_tags(doc, ner_data) | ||||
|  |  | |||
|  | @ -1457,15 +1457,28 @@ def check_bool_env_var(env_var: str) -> bool: | |||
|     return bool(value) | ||||
| 
 | ||||
| 
 | ||||
| def _pipe(docs, proc, kwargs): | ||||
| def _pipe(docs, proc, name, default_error_handler, kwargs): | ||||
|     if hasattr(proc, "pipe"): | ||||
|         yield from proc.pipe(docs, **kwargs) | ||||
|     else: | ||||
|         # We added some args for pipe that __call__ doesn't expect. | ||||
|         kwargs = dict(kwargs) | ||||
|         error_handler = default_error_handler | ||||
|         if hasattr(proc, "get_error_handler"): | ||||
|             error_handler = proc.get_error_handler() | ||||
|         for arg in ["batch_size"]: | ||||
|             if arg in kwargs: | ||||
|                 kwargs.pop(arg) | ||||
|         for doc in docs: | ||||
|             doc = proc(doc, **kwargs) | ||||
|             yield doc | ||||
|             try: | ||||
|                 doc = proc(doc, **kwargs) | ||||
|                 yield doc | ||||
|             except Exception as e: | ||||
|                 error_handler(name, proc, [doc], e) | ||||
| 
 | ||||
| 
 | ||||
| def raise_error(proc_name, proc, docs, e): | ||||
|     raise e | ||||
| 
 | ||||
| def ignore_error(proc_name, proc, docs, e): | ||||
|     pass | ||||
|  |  | |||
|  | @ -82,7 +82,7 @@ Add an alias or mention to the knowledge base, specifying its potential KB | |||
| identifiers and their prior probabilities. The entity identifiers should refer | ||||
| to entities previously added with [`add_entity`](/api/kb#add_entity) or | ||||
| [`set_entities`](/api/kb#set_entities). The sum of the prior probabilities | ||||
| should not exceed 1. | ||||
| should not exceed 1. Note that an empty string can not be used as alias. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -92,7 +92,7 @@ should not exceed 1. | |||
| 
 | ||||
| | Name            | Description                                                                       | | ||||
| | --------------- | --------------------------------------------------------------------------------- | | ||||
| | `alias`         | The textual mention or alias. ~~str~~                                             | | ||||
| | `alias`         | The textual mention or alias. Can not be the empty string. ~~str~~                | | ||||
| | `entities`      | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ | | ||||
| | `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~                       | | ||||
| 
 | ||||
|  |  | |||
|  | @ -203,6 +203,28 @@ more efficient than processing texts one-by-one. | |||
| | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               | | ||||
| | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                | | ||||
| 
 | ||||
| ## Language.set_error_handler {#set_error_handler tag="method"} | ||||
| 
 | ||||
| Define a callback that will be invoked when an error is thrown during processing | ||||
| of one or more documents. Specifically, this function will call | ||||
| [`set_error_handler`](/api/pipe#set_error_handler) on all the pipeline | ||||
| components that define that function. The error handler will be invoked with the | ||||
| original component's name, the component itself, the list of documents that was | ||||
| being processed, and the original error. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > def warn_error(proc_name, proc, docs, e): | ||||
| >     print(f"An error occurred when applying component {proc_name}.") | ||||
| > | ||||
| > nlp.set_error_handler(warn_error) | ||||
| > ``` | ||||
| 
 | ||||
| | Name            | Description                                                                                                    | | ||||
| | --------------- | -------------------------------------------------------------------------------------------------------------- | | ||||
| | `error_handler` | A function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception]~~ | | ||||
| 
 | ||||
| ## Language.initialize {#initialize tag="method" new="3"} | ||||
| 
 | ||||
| Initialize the pipeline for training and return an | ||||
|  |  | |||
|  | @ -100,6 +100,47 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and | |||
| | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ | | ||||
| | **YIELDS**     | The processed documents in order. ~~Doc~~                     | | ||||
| 
 | ||||
| ## TrainablePipe.set_error_handler {#set_error_handler tag="method"} | ||||
| 
 | ||||
| Define a callback that will be invoked when an error is thrown during processing | ||||
| of one or more documents with either [`__call__`](/api/pipe#call) or | ||||
| [`pipe`](/api/pipe#pipe). The error handler will be invoked with the original | ||||
| component's name, the component itself, the list of documents that was being | ||||
| processed, and the original error. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > def warn_error(proc_name, proc, docs, e): | ||||
| >     print(f"An error occurred when applying component {proc_name}.") | ||||
| > | ||||
| > pipe = nlp.add_pipe("ner") | ||||
| > pipe.set_error_handler(warn_error) | ||||
| > ``` | ||||
| 
 | ||||
| | Name            | Description                                                                                                    | | ||||
| | --------------- | -------------------------------------------------------------------------------------------------------------- | | ||||
| | `error_handler` | A function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception]~~ | | ||||
| 
 | ||||
| ## TrainablePipe.get_error_handler {#get_error_handler tag="method"} | ||||
| 
 | ||||
| Retrieve the callback that performs error handling for this component's | ||||
| [`__call__`](/api/pipe#call) and [`pipe`](/api/pipe#pipe) methods. If no custom | ||||
| function was previously defined with | ||||
| [`set_error_handler`](/api/pipe#set_error_handler), a default function is | ||||
| returned that simply reraises the exception. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > pipe = nlp.add_pipe("ner") | ||||
| > error_handler = pipe.get_error_handler() | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                                                                      | | ||||
| | ----------- | ---------------------------------------------------------------------------------------------------------------- | | ||||
| | **RETURNS** | The function that performs custom error handling. ~~Callable[[str, Callable[[Doc], Doc], List[Doc], Exception]~~ | | ||||
| 
 | ||||
| ## TrainablePipe.initialize {#initialize tag="method" new="3"} | ||||
| 
 | ||||
| Initialize the component for training. `get_examples` should be a function that | ||||
|  | @ -190,14 +231,14 @@ predictions and gold-standard annotations, and update the component's model. | |||
| > losses = pipe.update(examples, sgd=optimizer) | ||||
| > ``` | ||||
| 
 | ||||
| | Name              | Description                                                                                                                        | | ||||
| | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  | | ||||
| | _keyword-only_    |                                                                                                                                    | | ||||
| | `drop`            | The dropout rate. ~~float~~                                                                                                        | | ||||
| | `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      | | ||||
| | `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           | | ||||
| | **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              | | ||||
| | Name           | Description                                                                                                              | | ||||
| | -------------- | ------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        | | ||||
| | _keyword-only_ |                                                                                                                          | | ||||
| | `drop`         | The dropout rate. ~~float~~                                                                                              | | ||||
| | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            | | ||||
| | `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | | ||||
| | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    | | ||||
| 
 | ||||
| ## TrainablePipe.rehearse {#rehearse tag="method,experimental" new="3"} | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user