From 787d066e223b23198d2b8d6ce9249bbde8266b4e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 26 Jul 2020 13:08:52 +0200 Subject: [PATCH] Remove pipes.pyx Probably accidentally re-added in a merge? --- spacy/pipeline/pipes.pyx | 1519 -------------------------------------- 1 file changed, 1519 deletions(-) delete mode 100644 spacy/pipeline/pipes.pyx diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx deleted file mode 100644 index 5c71a33e7..000000000 --- a/spacy/pipeline/pipes.pyx +++ /dev/null @@ -1,1519 +0,0 @@ -# cython: infer_types=True, profile=True -import numpy -import srsly -import random - -from thinc.api import CosineDistance, to_categorical, get_array_module -from thinc.api import set_dropout_rate, SequenceCategoricalCrossentropy -import warnings - -from ..tokens.doc cimport Doc -from ..syntax.nn_parser cimport Parser -from ..syntax.ner cimport BiluoPushDown -from ..syntax.arc_eager cimport ArcEager -from ..morphology cimport Morphology -from ..vocab cimport Vocab - -from .defaults import default_tagger, default_parser, default_ner, default_textcat -from .defaults import default_nel, default_senter -from .functions import merge_subtokens -from ..language import Language, component -from ..syntax import nonproj -from ..gold.example import Example -from ..attrs import POS, ID -from ..util import link_vectors_to_models, create_default_optimizer -from ..parts_of_speech import X -from ..kb import KnowledgeBase -from ..errors import Errors, TempErrors, Warnings -from .. import util -from ..scorer import Scorer - - -def _load_cfg(path): - if path.exists(): - return srsly.read_json(path) - else: - return {} - - -class Pipe: - """This class is not instantiated directly. Components inherit from it, and - it defines the interface that components should follow to function as - components in a spaCy analysis pipeline. - """ - - name = None - - @classmethod - def from_nlp(cls, nlp, model, **cfg): - return cls(nlp.vocab, model, **cfg) - - def __init__(self, vocab, model, **cfg): - """Create a new pipe instance.""" - raise NotImplementedError - - def __call__(self, Doc doc): - """Apply the pipe to one document. The document is - modified in-place, and returned. - - Both __call__ and pipe should delegate to the `predict()` - and `set_annotations()` methods. - """ - scores = self.predict([doc]) - self.set_annotations([doc], scores) - return doc - - def pipe(self, stream, batch_size=128): - """Apply the pipe to a stream of documents. - - Both __call__ and pipe should delegate to the `predict()` - and `set_annotations()` methods. - """ - for docs in util.minibatch(stream, size=batch_size): - scores = self.predict(docs) - self.set_annotations(docs, scores) - yield from docs - - def predict(self, docs): - """Apply the pipeline's model to a batch of docs, without - modifying them. - """ - raise NotImplementedError - - def set_annotations(self, docs, scores): - """Modify a batch of documents, using pre-computed scores.""" - raise NotImplementedError - - def rehearse(self, examples, sgd=None, losses=None, **config): - pass - - def get_loss(self, examples, scores): - """Find the loss and gradient of loss for the batch of - examples (with embedded docs) and their predicted scores.""" - raise NotImplementedError - - def add_label(self, label): - """Add an output label, to be predicted by the model. - - It's possible to extend pretrained models with new labels, - but care should be taken to avoid the "catastrophic forgetting" - problem. - """ - raise NotImplementedError - - def create_optimizer(self): - return create_default_optimizer() - - def begin_training( - self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs - ): - """Initialize the pipe for training, using data exampes if available. - If no model has been initialized yet, the model is added.""" - self.model.initialize() - if hasattr(self, "vocab"): - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def set_output(self, nO): - if self.model.has_dim("nO") is not False: - self.model.set_dim("nO", nO) - if self.model.has_ref("output_layer"): - self.model.get_ref("output_layer").set_dim("nO", nO) - - def get_gradients(self): - """Get non-zero gradients of the model's parameters, as a dictionary - keyed by the parameter ID. The values are (weights, gradients) tuples. - """ - gradients = {} - queue = [self.model] - seen = set() - for node in queue: - if node.id in seen: - continue - seen.add(node.id) - if hasattr(node, "_mem") and node._mem.gradient.any(): - gradients[node.id] = [node._mem.weights, node._mem.gradient] - if hasattr(node, "_layers"): - queue.extend(node._layers) - return gradients - - def use_params(self, params): - """Modify the pipe's model, to use the given parameter values.""" - with self.model.use_params(params): - yield - - def score(self, examples, **kwargs): - return {} - - def to_bytes(self, exclude=tuple()): - """Serialize the pipe to a bytestring. - - exclude (list): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - """ - serialize = {} - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["model"] = self.model.to_bytes - if hasattr(self, "vocab"): - serialize["vocab"] = self.vocab.to_bytes - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, exclude=tuple()): - """Load the pipe from a bytestring.""" - - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) - - deserialize = {} - if hasattr(self, "vocab"): - deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["model"] = load_model - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, exclude=tuple()): - """Serialize the pipe to disk.""" - serialize = {} - serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["vocab"] = lambda p: self.vocab.to_disk(p) - serialize["model"] = lambda p: self.model.to_disk(p) - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, exclude=tuple()): - """Load the pipe from disk.""" - - def load_model(p): - try: - self.model.from_bytes(p.open("rb").read()) - except AttributeError: - raise ValueError(Errors.E149) - - deserialize = {} - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) - deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) - deserialize["model"] = load_model - util.from_disk(path, deserialize, exclude) - return self - - -@component("tagger", assigns=["token.tag", "token.pos", "token.lemma"], default_model=default_tagger) -class Tagger(Pipe): - """Pipeline component for part-of-speech tagging. - - DOCS: https://spacy.io/api/tagger - """ - - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - self._rehearsal_model = None - self.cfg = dict(sorted(cfg.items())) - - @property - def labels(self): - return tuple(self.vocab.morphology.tag_names) - - def __call__(self, doc): - tags = self.predict([doc]) - self.set_annotations([doc], tags) - return doc - - def pipe(self, stream, batch_size=128): - for docs in util.minibatch(stream, size=batch_size): - tag_ids = self.predict(docs) - self.set_annotations(docs, tag_ids) - yield from docs - - def predict(self, docs): - if not any(len(doc) for doc in docs): - # Handle cases where there are no tokens in any docs. - n_labels = len(self.labels) - guesses = [self.model.ops.alloc((0, n_labels)) for doc in docs] - assert len(guesses) == len(docs) - return guesses - scores = self.model.predict(docs) - assert len(scores) == len(docs), (len(scores), len(docs)) - guesses = self._scores2guesses(scores) - assert len(guesses) == len(docs) - return guesses - - def _scores2guesses(self, scores): - guesses = [] - for doc_scores in scores: - doc_guesses = doc_scores.argmax(axis=1) - if not isinstance(doc_guesses, numpy.ndarray): - doc_guesses = doc_guesses.get() - guesses.append(doc_guesses) - return guesses - - def set_annotations(self, docs, batch_tag_ids): - if isinstance(docs, Doc): - docs = [docs] - cdef Doc doc - cdef int idx = 0 - cdef Vocab vocab = self.vocab - assign_morphology = self.cfg.get("set_morphology", True) - for i, doc in enumerate(docs): - doc_tag_ids = batch_tag_ids[i] - if hasattr(doc_tag_ids, "get"): - doc_tag_ids = doc_tag_ids.get() - for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber preset POS tags - if doc.c[j].tag == 0: - if doc.c[j].pos == 0 and assign_morphology: - # Don't clobber preset lemmas - lemma = doc.c[j].lemma - vocab.morphology.assign_tag_id(&doc.c[j], tag_id) - if lemma != 0 and lemma != doc.c[j].lex.orth: - doc.c[j].lemma = lemma - else: - doc.c[j].tag = self.vocab.strings[self.labels[tag_id]] - idx += 1 - doc.is_tagged = True - - def update(self, examples, *, drop=0., sgd=None, losses=None, set_annotations=False): - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - - try: - if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): - # Handle cases where there are no tokens in any docs. - return - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) - set_dropout_rate(self.model, drop) - tag_scores, bp_tag_scores = self.model.begin_update( - [eg.predicted for eg in examples]) - for sc in tag_scores: - if self.model.ops.xp.isnan(sc.sum()): - raise ValueError("nan value in scores") - loss, d_tag_scores = self.get_loss(examples, tag_scores) - bp_tag_scores(d_tag_scores) - if sgd not in (None, False): - self.model.finish_update(sgd) - - losses[self.name] += loss - if set_annotations: - docs = [eg.predicted for eg in examples] - self.set_annotations(docs, self._scores2guesses(tag_scores)) - return losses - - def rehearse(self, examples, drop=0., sgd=None, losses=None): - """Perform a 'rehearsal' update, where we try to match the output of - an initial model. - """ - try: - docs = [eg.predicted for eg in examples] - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) - if self._rehearsal_model is None: - return - if not any(len(doc) for doc in docs): - # Handle cases where there are no tokens in any docs. - return - set_dropout_rate(self.model, drop) - guesses, backprop = self.model.begin_update(docs) - target = self._rehearsal_model(examples) - gradient = guesses - target - backprop(gradient) - self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += (gradient**2).sum() - - def get_loss(self, examples, scores): - loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) - truths = [eg.get_aligned("tag", as_string=True) for eg in examples] - d_scores, loss = loss_func(scores, truths) - if self.model.ops.xp.isnan(loss): - raise ValueError("nan value when computing loss") - return float(loss), d_scores - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, - **kwargs): - lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"] - if not any(table in self.vocab.lookups for table in lemma_tables): - warnings.warn(Warnings.W022) - if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0: - warnings.warn(Warnings.W033.format(model="part-of-speech tagger")) - orig_tag_map = dict(self.vocab.morphology.tag_map) - new_tag_map = {} - for example in get_examples(): - try: - y = example.y - except AttributeError: - raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) - for token in y: - tag = token.tag_ - if tag in orig_tag_map: - new_tag_map[tag] = orig_tag_map[tag] - else: - new_tag_map[tag] = {POS: X} - - cdef Vocab vocab = self.vocab - if new_tag_map: - if "_SP" in orig_tag_map: - new_tag_map["_SP"] = orig_tag_map["_SP"] - vocab.morphology = Morphology(vocab.strings, new_tag_map, - vocab.morphology.lemmatizer, - exc=vocab.morphology.exc) - self.set_output(len(self.labels)) - doc_sample = [Doc(self.vocab, words=["hello", "world"])] - if pipeline is not None: - for name, component in pipeline: - if component is self: - break - if hasattr(component, "pipe"): - doc_sample = list(component.pipe(doc_sample)) - else: - doc_sample = [component(doc) for doc in doc_sample] - self.model.initialize(X=doc_sample) - # Get batch of example docs, example outputs to call begin_training(). - # This lets the model infer shapes. - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def add_label(self, label, values=None): - if not isinstance(label, str): - raise ValueError(Errors.E187) - if label in self.labels: - return 0 - if self.model.has_dim("nO"): - # Here's how the model resizing will work, once the - # neuron-to-tag mapping is no longer controlled by - # the Morphology class, which sorts the tag names. - # The sorting makes adding labels difficult. - # smaller = self.model._layers[-1] - # larger = Softmax(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger - raise ValueError(TempErrors.T003) - tag_map = dict(self.vocab.morphology.tag_map) - if values is None: - values = {POS: "X"} - tag_map[label] = values - self.vocab.morphology = Morphology( - self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer, - exc=self.vocab.morphology.exc) - return 1 - - def use_params(self, params): - with self.model.use_params(params): - yield - - def score(self, examples, **kwargs): - scores = {} - scores.update(Scorer.score_token_attr(examples, "tag", **kwargs)) - scores.update(Scorer.score_token_attr(examples, "pos", **kwargs)) - scores.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) - return scores - - def to_bytes(self, exclude=tuple()): - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) - serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, exclude=tuple()): - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) - - def load_tag_map(b): - tag_map = srsly.msgpack_loads(b) - self.vocab.morphology = Morphology( - self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer, - exc=self.vocab.morphology.exc) - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "tag_map": load_tag_map, - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, exclude=tuple()): - tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "tag_map": lambda p: srsly.write_msgpack(p, tag_map), - "model": lambda p: self.model.to_disk(p), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, exclude=tuple()): - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) - - def load_tag_map(p): - tag_map = srsly.read_msgpack(p) - self.vocab.morphology = Morphology( - self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer, - exc=self.vocab.morphology.exc) - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(_load_cfg(p)), - "tag_map": load_tag_map, - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self - - -@component("senter", assigns=["token.is_sent_start"], default_model=default_senter) -class SentenceRecognizer(Tagger): - """Pipeline component for sentence segmentation. - - DOCS: https://spacy.io/api/sentencerecognizer - """ - - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - self._rehearsal_model = None - self.cfg = dict(sorted(cfg.items())) - - @property - def labels(self): - # labels are numbered by index internally, so this matches GoldParse - # and Example where the sentence-initial tag is 1 and other positions - # are 0 - return tuple(["I", "S"]) - - def set_annotations(self, docs, batch_tag_ids): - if isinstance(docs, Doc): - docs = [docs] - cdef Doc doc - for i, doc in enumerate(docs): - doc_tag_ids = batch_tag_ids[i] - if hasattr(doc_tag_ids, "get"): - doc_tag_ids = doc_tag_ids.get() - for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber existing sentence boundaries - if doc.c[j].sent_start == 0: - if tag_id == 1: - doc.c[j].sent_start = 1 - else: - doc.c[j].sent_start = -1 - - def get_loss(self, examples, scores): - labels = self.labels - loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) - truths = [] - for eg in examples: - eg_truth = [] - for x in eg.get_aligned("sent_start"): - if x == None: - eg_truth.append(None) - elif x == 1: - eg_truth.append(labels[1]) - else: - # anything other than 1: 0, -1, -1 as uint64 - eg_truth.append(labels[0]) - truths.append(eg_truth) - d_scores, loss = loss_func(scores, truths) - if self.model.ops.xp.isnan(loss): - raise ValueError("nan value when computing loss") - return float(loss), d_scores - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, - **kwargs): - self.set_output(len(self.labels)) - self.model.initialize() - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def add_label(self, label, values=None): - raise NotImplementedError - - def score(self, examples, **kwargs): - return Scorer.score_spans(examples, "sents", **kwargs) - - def to_bytes(self, exclude=tuple()): - serialize = {} - serialize["model"] = self.model.to_bytes - serialize["vocab"] = self.vocab.to_bytes - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, exclude=tuple()): - def load_model(b): - try: - self.model.from_bytes(b) - except AttributeError: - raise ValueError(Errors.E149) - - deserialize = { - "vocab": lambda b: self.vocab.from_bytes(b), - "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), - "model": lambda b: load_model(b), - } - util.from_bytes(bytes_data, deserialize, exclude) - return self - - def to_disk(self, path, exclude=tuple()): - serialize = { - "vocab": lambda p: self.vocab.to_disk(p), - "model": lambda p: p.open("wb").write(self.model.to_bytes()), - "cfg": lambda p: srsly.write_json(p, self.cfg), - } - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, exclude=tuple()): - def load_model(p): - with p.open("rb") as file_: - try: - self.model.from_bytes(file_.read()) - except AttributeError: - raise ValueError(Errors.E149) - - deserialize = { - "vocab": lambda p: self.vocab.from_disk(p), - "cfg": lambda p: self.cfg.update(_load_cfg(p)), - "model": load_model, - } - util.from_disk(path, deserialize, exclude) - return self - - -@component("nn_labeller") -class MultitaskObjective(Tagger): - """Experimental: Assist training of a parser or tagger, by training a - side-objective. - """ - - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - target = cfg["target"] # default: 'dep_tag_offset' - if target == "dep": - self.make_label = self.make_dep - elif target == "tag": - self.make_label = self.make_tag - elif target == "ent": - self.make_label = self.make_ent - elif target == "dep_tag_offset": - self.make_label = self.make_dep_tag_offset - elif target == "ent_tag": - self.make_label = self.make_ent_tag - elif target == "sent_start": - self.make_label = self.make_sent_start - elif hasattr(target, "__call__"): - self.make_label = target - else: - raise ValueError(Errors.E016) - self.cfg = dict(cfg) - - @property - def labels(self): - return self.cfg.setdefault("labels", {}) - - @labels.setter - def labels(self, value): - self.cfg["labels"] = value - - def set_annotations(self, docs, dep_ids): - pass - - def begin_training(self, get_examples=lambda: [], pipeline=None, - sgd=None, **kwargs): - gold_examples = nonproj.preprocess_training_data(get_examples()) - # for raw_text, doc_annot in gold_tuples: - for example in gold_examples: - for token in example.y: - label = self.make_label(token) - if label is not None and label not in self.labels: - self.labels[label] = len(self.labels) - self.model.initialize() - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def predict(self, docs): - tokvecs = self.model.get_ref("tok2vec")(docs) - scores = self.model.get_ref("softmax")(tokvecs) - return tokvecs, scores - - def get_loss(self, examples, scores): - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - docs = [eg.predicted for eg in examples] - for i, eg in enumerate(examples): - # Handles alignment for tokenization differences - doc_annots = eg.get_aligned() # TODO - for j in range(len(eg.predicted)): - tok_annots = {key: values[j] for key, values in tok_annots.items()} - label = self.make_label(j, tok_annots) - if label is None or label not in self.labels: - correct[idx] = guesses[idx] - else: - correct[idx] = self.labels[label] - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - loss = (d_scores**2).sum() - return float(loss), d_scores - - @staticmethod - def make_dep(token): - return token.dep_ - - @staticmethod - def make_tag(token): - return token.tag_ - - @staticmethod - def make_ent(token): - if token.ent_iob_ == "O": - return "O" - else: - return token.ent_iob_ + "-" + token.ent_type_ - - @staticmethod - def make_dep_tag_offset(token): - dep = token.dep_ - tag = token.tag_ - offset = token.head.i - token.i - offset = min(offset, 2) - offset = max(offset, -2) - return f"{dep}-{tag}:{offset}" - - @staticmethod - def make_ent_tag(token): - if token.ent_iob_ == "O": - ent = "O" - else: - ent = token.ent_iob_ + "-" + token.ent_type_ - tag = token.tag_ - return f"{tag}-{ent}" - - @staticmethod - def make_sent_start(token): - """A multi-task objective for representing sentence boundaries, - using BILU scheme. (O is impossible) - """ - if token.is_sent_start and token.is_sent_end: - return "U-SENT" - elif token.is_sent_start: - return "B-SENT" - else: - return "I-SENT" - - -class ClozeMultitask(Pipe): - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - self.cfg = cfg - self.distance = CosineDistance(ignore_zeros=True, normalize=False) # TODO: in config - - def set_annotations(self, docs, dep_ids): - pass - - def begin_training(self, get_examples=lambda: [], pipeline=None, - sgd=None, **kwargs): - link_vectors_to_models(self.vocab) - self.model.initialize() - X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) - self.model.output_layer.begin_training(X) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def predict(self, docs): - tokvecs = self.model.get_ref("tok2vec")(docs) - vectors = self.model.get_ref("output_layer")(tokvecs) - return tokvecs, vectors - - def get_loss(self, examples, vectors, prediction): - # The simplest way to implement this would be to vstack the - # token.vector values, but that's a bit inefficient, especially on GPU. - # Instead we fetch the index into the vectors table for each of our tokens, - # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) - target = vectors[ids] - gradient = self.distance.get_grad(prediction, target) - loss = self.distance.get_loss(prediction, target) - return loss, gradient - - def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): - pass - - def rehearse(self, examples, drop=0., sgd=None, losses=None): - if losses is not None and self.name not in losses: - losses[self.name] = 0. - set_dropout_rate(self.model, drop) - try: - predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples]) - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) - loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) - bp_predictions(d_predictions) - if sgd is not None: - self.model.finish_update(sgd) - - if losses is not None: - losses[self.name] += loss - - -@component("textcat", assigns=["doc.cats"], default_model=default_textcat) -class TextCategorizer(Pipe): - """Pipeline component for text classification. - - DOCS: https://spacy.io/api/textcategorizer - """ - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - self._rehearsal_model = None - self.cfg = dict(cfg) - - @property - def labels(self): - return tuple(self.cfg.setdefault("labels", [])) - - def require_labels(self): - """Raise an error if the component's model has no labels defined.""" - if not self.labels: - raise ValueError(Errors.E143.format(name=self.name)) - - @labels.setter - def labels(self, value): - self.cfg["labels"] = tuple(value) - - def pipe(self, stream, batch_size=128): - for docs in util.minibatch(stream, size=batch_size): - scores = self.predict(docs) - self.set_annotations(docs, scores) - yield from docs - - def predict(self, docs): - tensors = [doc.tensor for doc in docs] - - if not any(len(doc) for doc in docs): - # Handle cases where there are no tokens in any docs. - xp = get_array_module(tensors) - scores = xp.zeros((len(docs), len(self.labels))) - return scores - - scores = self.model.predict(docs) - scores = self.model.ops.asarray(scores) - return scores - - def set_annotations(self, docs, scores): - for i, doc in enumerate(docs): - for j, label in enumerate(self.labels): - doc.cats[label] = float(scores[i, j]) - - def update(self, examples, *, drop=0., set_annotations=False, sgd=None, losses=None): - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - try: - if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): - # Handle cases where there are no tokens in any docs. - return losses - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=types)) - set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update( - [eg.predicted for eg in examples] - ) - loss, d_scores = self.get_loss(examples, scores) - bp_scores(d_scores) - if sgd is not None: - self.model.finish_update(sgd) - losses[self.name] += loss - if set_annotations: - docs = [eg.predicted for eg in examples] - self.set_annotations(docs, scores=scores) - return losses - - def rehearse(self, examples, drop=0., sgd=None, losses=None): - if self._rehearsal_model is None: - return - try: - docs = [eg.predicted for eg in examples] - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types)) - if not any(len(doc) for doc in docs): - # Handle cases where there are no tokens in any docs. - return - set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update(docs) - target = self._rehearsal_model(examples) - gradient = scores - target - bp_scores(gradient) - if sgd is not None: - self.model.finish_update(sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += (gradient**2).sum() - - def _examples_to_truth(self, examples): - truths = numpy.zeros((len(examples), len(self.labels)), dtype="f") - not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") - for i, eg in enumerate(examples): - for j, label in enumerate(self.labels): - if label in eg.reference.cats: - truths[i, j] = eg.reference.cats[label] - else: - not_missing[i, j] = 0. - truths = self.model.ops.asarray(truths) - return truths, not_missing - - def get_loss(self, examples, scores): - truths, not_missing = self._examples_to_truth(examples) - not_missing = self.model.ops.asarray(not_missing) - d_scores = (scores-truths) / scores.shape[0] - d_scores *= not_missing - mean_square_error = (d_scores**2).sum(axis=1).mean() - return float(mean_square_error), d_scores - - def add_label(self, label): - if not isinstance(label, str): - raise ValueError(Errors.E187) - if label in self.labels: - return 0 - if self.model.has_dim("nO"): - # This functionality was available previously, but was broken. - # The problem is that we resize the last layer, but the last layer - # is actually just an ensemble. We're not resizing the child layers - # - a huge problem. - raise ValueError(Errors.E116) - # smaller = self.model._layers[-1] - # larger = Linear(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger - self.labels = tuple(list(self.labels) + [label]) - return 1 - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - # TODO: begin_training is not guaranteed to see all data / labels ? - examples = list(get_examples()) - for example in examples: - try: - y = example.y - except AttributeError: - raise TypeError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example))) - for cat in y.cats: - self.add_label(cat) - self.require_labels() - docs = [Doc(Vocab(), words=["hello"])] - truths, _ = self._examples_to_truth(examples) - self.set_output(len(self.labels)) - link_vectors_to_models(self.vocab) - self.model.initialize(X=docs, Y=truths) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def score(self, examples, **kwargs): - return Scorer.score_cats(examples, "cats", labels=self.labels, - multi_label=self.model.attrs["multi_label"], - positive_label=self.cfg.get("positive_label", None), - **kwargs - ) - - -cdef class DependencyParser(Parser): - """Pipeline component for dependency parsing. - - DOCS: https://spacy.io/api/dependencyparser - """ - # cdef classes can't have decorators, so we're defining this here - name = "parser" - factory = "parser" - assigns = ["token.dep", "token.is_sent_start", "doc.sents"] - requires = [] - TransitionSystem = ArcEager - - @property - def postprocesses(self): - output = [nonproj.deprojectivize] - if self.cfg.get("learn_tokens") is True: - output.append(merge_subtokens) - return tuple(output) - - def add_multitask_objective(self, mt_component): - self._multitasks.append(mt_component) - - def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): - # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? - for labeller in self._multitasks: - labeller.model.set_dim("nO", len(self.labels)) - if labeller.model.has_ref("output_layer"): - labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline, sgd=sgd) - - def __reduce__(self): - return (DependencyParser, (self.vocab, self.model), (self.moves, self.cfg)) - - def __getstate__(self): - return (self.moves, self.cfg) - - def __setstate__(self, state): - moves, config = state - self.moves = moves - self.cfg = config - - @property - def labels(self): - labels = set() - # Get the labels from the model by looking at the available moves - for move in self.move_names: - if "-" in move: - label = move.split("-")[1] - if "||" in label: - label = label.split("||")[1] - labels.add(label) - return tuple(sorted(labels)) - - def score(self, examples, **kwargs): - def dep_getter(token, attr): - dep = getattr(token, attr) - dep = token.vocab.strings.as_string(dep).lower() - return dep - results = {} - results.update(Scorer.score_spans(examples, "sents", **kwargs)) - results.update(Scorer.score_deps(examples, "dep", getter=dep_getter, - ignore_labels=("p", "punct"), **kwargs)) - return results - - -cdef class EntityRecognizer(Parser): - """Pipeline component for named entity recognition. - - DOCS: https://spacy.io/api/entityrecognizer - """ - name = "ner" - factory = "ner" - assigns = ["doc.ents", "token.ent_iob", "token.ent_type"] - requires = [] - TransitionSystem = BiluoPushDown - - def add_multitask_objective(self, mt_component): - self._multitasks.append(mt_component) - - def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg): - # TODO: transfer self.model.get_ref("tok2vec") to the multitask's model ? - for labeller in self._multitasks: - labeller.model.set_dim("nO", len(self.labels)) - if labeller.model.has_ref("output_layer"): - labeller.model.get_ref("output_layer").set_dim("nO", len(self.labels)) - labeller.begin_training(get_examples, pipeline=pipeline) - - def __reduce__(self): - return (EntityRecognizer, (self.vocab, self.model), (self.moves, self.cfg)) - - def __getstate__(self): - return self.moves, self.cfg - - def __setstate__(self, state): - moves, config = state - self.moves = moves - self.cfg = config - - @property - def labels(self): - # Get the labels from the model by looking at the available moves, e.g. - # B-PERSON, I-PERSON, L-PERSON, U-PERSON - labels = set(move.split("-")[1] for move in self.move_names - if move[0] in ("B", "I", "L", "U")) - return tuple(sorted(labels)) - - def score(self, examples, **kwargs): - return Scorer.score_spans(examples, "ents", **kwargs) - -@component( - "entity_linker", - requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"], - assigns=["token.ent_kb_id"], - default_model=default_nel, -) -class EntityLinker(Pipe): - """Pipeline component for named entity linking. - - DOCS: https://spacy.io/api/entitylinker - """ - NIL = "NIL" # string used to refer to a non-existing link - - def __init__(self, vocab, model, **cfg): - self.vocab = vocab - self.model = model - self.kb = None - self.kb = cfg.get("kb", None) - if self.kb is None: - # create an empty KB that should be filled by calling from_disk - self.kb = KnowledgeBase(vocab=vocab) - else: - del cfg["kb"] # we don't want to duplicate its serialization - if not isinstance(self.kb, KnowledgeBase): - raise ValueError(Errors.E990.format(type=type(self.kb))) - self.cfg = dict(cfg) - self.distance = CosineDistance(normalize=False) - # how many neightbour sentences to take into account - self.n_sents = cfg.get("n_sents", 0) - - def require_kb(self): - # Raise an error if the knowledge base is not initialized. - if len(self.kb) == 0: - raise ValueError(Errors.E139.format(name=self.name)) - - def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs): - self.require_kb() - nO = self.kb.entity_vector_length - self.set_output(nO) - self.model.initialize() - if sgd is None: - sgd = self.create_optimizer() - return sgd - - def update(self, examples, *, set_annotations=False, drop=0.0, sgd=None, losses=None): - self.require_kb() - if losses is None: - losses = {} - losses.setdefault(self.name, 0.0) - if not examples: - return losses - sentence_docs = [] - try: - docs = [eg.predicted for eg in examples] - except AttributeError: - types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="EntityLinker", method="update", types=types)) - if set_annotations: - # This seems simpler than other ways to get that exact output -- but - # it does run the model twice :( - predictions = self.model.predict(docs) - - for eg in examples: - sentences = [s for s in eg.predicted.sents] - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.predicted.ents: - kb_id = kb_ids[ent.start] # KB ID of the first token is the same as the whole span - if kb_id: - try: - # find the sentence in the list of sentences. - sent_index = sentences.index(ent.sent) - except AttributeError: - # Catch the exception when ent.sent is None and provide a user-friendly warning - raise RuntimeError(Errors.E030) - # get n previous sentences, if there are any - start_sentence = max(0, sent_index - self.n_sents) - - # get n posterior sentences, or as many < n as there are - end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - - # get token positions - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - - # append that span as a doc to training - sent_doc = eg.predicted[start_token:end_token].as_doc() - sentence_docs.append(sent_doc) - set_dropout_rate(self.model, drop) - if not sentence_docs: - warnings.warn(Warnings.W093.format(name="Entity Linker")) - return 0.0 - sentence_encodings, bp_context = self.model.begin_update(sentence_docs) - loss, d_scores = self.get_similarity_loss( - sentence_encodings=sentence_encodings, - examples=examples - ) - bp_context(d_scores) - if sgd is not None: - self.model.finish_update(sgd) - - losses[self.name] += loss - if set_annotations: - self.set_annotations(docs, predictions) - return losses - - def get_similarity_loss(self, examples, sentence_encodings): - entity_encodings = [] - for eg in examples: - kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True) - for ent in eg.predicted.ents: - kb_id = kb_ids[ent.start] - if kb_id: - entity_encoding = self.kb.get_vector(kb_id) - entity_encodings.append(entity_encoding) - - entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - - if sentence_encodings.shape != entity_encodings.shape: - raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up")) - - gradients = self.distance.get_grad(sentence_encodings, entity_encodings) - loss = self.distance.get_loss(sentence_encodings, entity_encodings) - loss = loss / len(entity_encodings) - return loss, gradients - - def __call__(self, doc): - kb_ids = self.predict([doc]) - self.set_annotations([doc], kb_ids) - return doc - - def pipe(self, stream, batch_size=128): - for docs in util.minibatch(stream, size=batch_size): - kb_ids = self.predict(docs) - self.set_annotations(docs, kb_ids) - yield from docs - - def predict(self, docs): - """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ - self.require_kb() - entity_count = 0 - final_kb_ids = [] - - if not docs: - return final_kb_ids - - if isinstance(docs, Doc): - docs = [docs] - - for i, doc in enumerate(docs): - sentences = [s for s in doc.sents] - - if len(doc) > 0: - # Looping through each sentence and each entity - # This may go wrong if there are entities across sentences - which shouldn't happen normally. - for sent_index, sent in enumerate(sentences): - if sent.ents: - # get n_neightbour sentences, clipped to the length of the document - start_sentence = max(0, sent_index - self.n_sents) - end_sentence = min(len(sentences) -1, sent_index + self.n_sents) - - start_token = sentences[start_sentence].start - end_token = sentences[end_sentence].end - - sent_doc = doc[start_token:end_token].as_doc() - # currently, the context is the same for each entity in a sentence (should be refined) - sentence_encoding = self.model.predict([sent_doc])[0] - xp = get_array_module(sentence_encoding) - sentence_encoding_t = sentence_encoding.T - sentence_norm = xp.linalg.norm(sentence_encoding_t) - - for ent in sent.ents: - entity_count += 1 - - to_discard = self.cfg.get("labels_discard", []) - if to_discard and ent.label_ in to_discard: - # ignoring this entity - setting to NIL - final_kb_ids.append(self.NIL) - - else: - candidates = self.kb.get_candidates(ent.text) - if not candidates: - # no prediction possible for this entity - setting to NIL - final_kb_ids.append(self.NIL) - - elif len(candidates) == 1: - # shortcut for efficiency reasons: take the 1 candidate - - # TODO: thresholding - final_kb_ids.append(candidates[0].entity_) - - else: - random.shuffle(candidates) - - # this will set all prior probabilities to 0 if they should be excluded from the model - prior_probs = xp.asarray([c.prior_prob for c in candidates]) - if not self.cfg.get("incl_prior", True): - prior_probs = xp.asarray([0.0 for c in candidates]) - scores = prior_probs - - # add in similarity from the context - if self.cfg.get("incl_context", True): - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - entity_norm = xp.linalg.norm(entity_encodings, axis=1) - - if len(entity_encodings) != len(prior_probs): - raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) - - # cosine similarity - sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm) - if sims.shape != prior_probs.shape: - raise ValueError(Errors.E161) - scores = prior_probs + sims - (prior_probs*sims) - - # TODO: thresholding - best_index = scores.argmax().item() - best_candidate = candidates[best_index] - final_kb_ids.append(best_candidate.entity_) - - if not (len(final_kb_ids) == entity_count): - raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length")) - - return final_kb_ids - - def set_annotations(self, docs, kb_ids): - count_ents = len([ent for doc in docs for ent in doc.ents]) - if count_ents != len(kb_ids): - raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids))) - - i=0 - for doc in docs: - for ent in doc.ents: - kb_id = kb_ids[i] - i += 1 - for token in ent: - token.ent_kb_id_ = kb_id - - def to_disk(self, path, exclude=tuple()): - serialize = {} - self.cfg["entity_width"] = self.kb.entity_vector_length - serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) - serialize["vocab"] = lambda p: self.vocab.to_disk(p) - serialize["kb"] = lambda p: self.kb.dump(p) - serialize["model"] = lambda p: self.model.to_disk(p) - util.to_disk(path, serialize, exclude) - - def from_disk(self, path, exclude=tuple()): - def load_model(p): - try: - self.model.from_bytes(p.open("rb").read()) - except AttributeError: - raise ValueError(Errors.E149) - - def load_kb(p): - self.kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) - self.kb.load_bulk(p) - - deserialize = {} - deserialize["vocab"] = lambda p: self.vocab.from_disk(p) - deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) - deserialize["kb"] = load_kb - deserialize["model"] = load_model - util.from_disk(path, deserialize, exclude) - return self - - def rehearse(self, examples, sgd=None, losses=None, **config): - raise NotImplementedError - - def add_label(self, label): - raise NotImplementedError - - -@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"]) -class Sentencizer(Pipe): - """Segment the Doc into sentences using a rule-based strategy. - - DOCS: https://spacy.io/api/sentencizer - """ - - default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', - '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', - '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', - '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', - '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', - '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', - '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', - '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', - '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', - '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', - '。', '。'] - - def __init__(self, punct_chars=None, **kwargs): - """Initialize the sentencizer. - - punct_chars (list): Punctuation characters to split on. Will be - serialized with the nlp object. - RETURNS (Sentencizer): The sentencizer component. - - DOCS: https://spacy.io/api/sentencizer#init - """ - if punct_chars: - self.punct_chars = set(punct_chars) - else: - self.punct_chars = set(self.default_punct_chars) - - @classmethod - def from_nlp(cls, nlp, model=None, **cfg): - return cls(**cfg) - - def begin_training( - self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs - ): - pass - - def __call__(self, doc): - """Apply the sentencizer to a Doc and set Token.is_sent_start. - - example (Doc or Example): The document to process. - RETURNS (Doc or Example): The processed Doc or Example. - - DOCS: https://spacy.io/api/sentencizer#call - """ - start = 0 - seen_period = False - for i, token in enumerate(doc): - is_in_punct_chars = token.text in self.punct_chars - token.is_sent_start = i == 0 - if seen_period and not token.is_punct and not is_in_punct_chars: - doc[start].is_sent_start = True - start = token.i - seen_period = False - elif is_in_punct_chars: - seen_period = True - if start < len(doc): - doc[start].is_sent_start = True - return doc - - def pipe(self, stream, batch_size=128): - for docs in util.minibatch(stream, size=batch_size): - predictions = self.predict(docs) - self.set_annotations(docs, predictions) - yield from docs - - def predict(self, docs): - """Apply the pipeline's model to a batch of docs, without - modifying them. - """ - if not any(len(doc) for doc in docs): - # Handle cases where there are no tokens in any docs. - guesses = [[] for doc in docs] - return guesses - guesses = [] - for doc in docs: - doc_guesses = [False] * len(doc) - if len(doc) > 0: - start = 0 - seen_period = False - doc_guesses[0] = True - for i, token in enumerate(doc): - is_in_punct_chars = token.text in self.punct_chars - if seen_period and not token.is_punct and not is_in_punct_chars: - doc_guesses[start] = True - start = token.i - seen_period = False - elif is_in_punct_chars: - seen_period = True - if start < len(doc): - doc_guesses[start] = True - guesses.append(doc_guesses) - return guesses - - def set_annotations(self, docs, batch_tag_ids): - if isinstance(docs, Doc): - docs = [docs] - cdef Doc doc - cdef int idx = 0 - for i, doc in enumerate(docs): - doc_tag_ids = batch_tag_ids[i] - for j, tag_id in enumerate(doc_tag_ids): - # Don't clobber existing sentence boundaries - if doc.c[j].sent_start == 0: - if tag_id: - doc.c[j].sent_start = 1 - else: - doc.c[j].sent_start = -1 - - def score(self, examples, **kwargs): - return Scorer.score_spans(examples, "sents", **kwargs) - - def to_bytes(self, **kwargs): - """Serialize the sentencizer to a bytestring. - - RETURNS (bytes): The serialized object. - - DOCS: https://spacy.io/api/sentencizer#to_bytes - """ - return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) - - def from_bytes(self, bytes_data, **kwargs): - """Load the sentencizer from a bytestring. - - bytes_data (bytes): The data to load. - returns (Sentencizer): The loaded object. - - DOCS: https://spacy.io/api/sentencizer#from_bytes - """ - cfg = srsly.msgpack_loads(bytes_data) - self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) - return self - - def to_disk(self, path, exclude=tuple(), **kwargs): - """Serialize the sentencizer to disk. - - DOCS: https://spacy.io/api/sentencizer#to_disk - """ - path = util.ensure_path(path) - path = path.with_suffix(".json") - srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) - - - def from_disk(self, path, exclude=tuple(), **kwargs): - """Load the sentencizer from disk. - - DOCS: https://spacy.io/api/sentencizer#from_disk - """ - path = util.ensure_path(path) - path = path.with_suffix(".json") - cfg = srsly.read_json(path) - self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) - return self - - -# Cython classes can't be decorated, so we need to add the factories here -Language.factories["parser"] = lambda nlp, model, **cfg: parser_factory(nlp, model, **cfg) -Language.factories["ner"] = lambda nlp, model, **cfg: ner_factory(nlp, model, **cfg) - -def parser_factory(nlp, model, **cfg): - default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} - if model is None: - model = default_parser() - warnings.warn(Warnings.W098.format(name="parser")) - for key, value in default_config.items(): - if key not in cfg: - cfg[key] = value - return DependencyParser.from_nlp(nlp, model, **cfg) - -def ner_factory(nlp, model, **cfg): - default_config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} - if model is None: - model = default_ner() - warnings.warn(Warnings.W098.format(name="ner")) - for key, value in default_config.items(): - if key not in cfg: - cfg[key] = value - return EntityRecognizer.from_nlp(nlp, model, **cfg) - -__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]