💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx * Merge some components back together * Fix typo
2025-10-28 06:31:12 +03:00 · 2019-02-10 12:14:51 +01:00 · 2019-02-10 12:14:51 +01:00 · a9f8d17632
commit a9f8d17632
parent e7593b791e
7 changed files with 400 additions and 360 deletions
--- a/setup.py
+++ b/setup.py
@ -41,7 +41,7 @@ MOD_NAMES = [
    "spacy.vocab",
    "spacy.attrs",
    "spacy.morphology",
-    "spacy.pipeline",
+    "spacy.pipeline.pipes",
    "spacy.syntax.stateclass",
    "spacy.syntax._state",
    "spacy.tokenizer",
--- a/spacy/pipeline.pxd
+++ b/spacy/pipeline.pxd
--- a/spacy/pipeline/init.py
+++ b/spacy/pipeline/init.py
@ -0,0 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa
 from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa
 from .entityruler import EntityRuler  # noqa
 from .hooks import SentenceSegmenter, SimilarityHook  # noqa
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens  # noqa
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -0,0 +1,170 @@
 # coding: utf8
 from __future__ import unicode_literals
 from collections import defaultdict
 import srsly
 from ..errors import Errors
 from ..compat import basestring_
 from ..util import ensure_path
 from ..tokens import Span
 from ..matcher import Matcher, PhraseMatcher
 class EntityRuler(object):
    name = "entity_ruler"
    def __init__(self, nlp, **cfg):
        """Initialise the entitiy ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
        RETURNS (EntityRuler): The newly constructed object.
        """
        self.nlp = nlp
        self.overwrite = cfg.get("overwrite_ents", False)
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self.matcher = Matcher(nlp.vocab)
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        patterns = cfg.get("patterns")
        if patterns is not None:
            self.add_patterns(patterns)
    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns
    def __contains__(self, label):
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns
    def __call__(self, doc):
        """Find matches in document and add them as entities.
        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        get_sort_key = lambda m: (m[2] - m[1], m[1])
        matches = sorted(matches, key=get_sort_key, reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc
    @property
    def labels(self):
        """All labels present in the match patterns.
        RETURNS (set): The string labels.
        """
        all_labels = set(self.token_patterns.keys())
        all_labels.update(self.phrase_patterns.keys())
        return all_labels
    @property
    def patterns(self):
        """Get all patterns that were added to the entity ruler.
        RETURNS (list): The original patterns, one dictionary per pattern.
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern})
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern.text})
        return all_patterns
    def add_patterns(self, patterns):
        """Add patterns to the entitiy ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
        patterns (list): The patterns to add.
        """
        for entry in patterns:
            label = entry["label"]
            pattern = entry["pattern"]
            if isinstance(pattern, basestring_):
                self.phrase_patterns[label].append(self.nlp(pattern))
            elif isinstance(pattern, list):
                self.token_patterns[label].append(pattern)
            else:
                raise ValueError(Errors.E097.format(pattern=pattern))
        for label, patterns in self.token_patterns.items():
            self.matcher.add(label, None, *patterns)
        for label, patterns in self.phrase_patterns.items():
            self.phrase_matcher.add(label, None, *patterns)
    def from_bytes(self, patterns_bytes, **kwargs):
        """Load the entity ruler from a bytestring.
        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        patterns = srsly.msgpack_loads(patterns_bytes)
        self.add_patterns(patterns)
        return self
    def to_bytes(self, **kwargs):
        """Serialize the entity ruler patterns to a bytestring.
        RETURNS (bytes): The serialized patterns.
        """
        return srsly.msgpack_dumps(self.patterns)
    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.
        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self
    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).
        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        srsly.write_jsonl(path, self.patterns)
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@ -0,0 +1,48 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ..matcher import Matcher
 # TODO: replace doc.merge with doc.retokenize
 def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.
    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        return doc
    spans = [
        (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
    ]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc
 def merge_entities(doc):
    """Merge entities into a single token.
    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun entities.
    """
    spans = [
        (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
    ]
    for start, end, tag, dep, ent_type in spans:
        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
    return doc
 def merge_subtokens(doc, label="subtok"):
    merger = Matcher(doc.vocab)
    merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
    matches = merger(doc)
    spans = [doc[start : end + 1] for _, start, end in matches]
    offsets = [(span.start_char, span.end_char) for span in spans]
    for start_char, end_char in offsets:
        doc.merge(start_char, end_char)
    return doc
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@ -0,0 +1,100 @@
 # coding: utf8
 from __future__ import unicode_literals
 from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 from .pipes import Pipe
 from .._ml import link_vectors_to_models
 class SentenceSegmenter(object):
    """A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse). To change the sentence
    boundary detection strategy, pass a generator function `strategy` on
    initialization, or assign a new strategy to the .strategy attribute.
    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    """
    name = "sentencizer"
    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == "on_punct":
            strategy = self.split_on_punct
        self.strategy = strategy
    def __call__(self, doc):
        doc.user_hooks["sents"] = self.strategy
        return doc
    @staticmethod
    def split_on_punct(doc):
        start = 0
        seen_period = False
        for i, word in enumerate(doc):
            if seen_period and not word.is_punct:
                yield doc[start : word.i]
                start = word.i
                seen_period = False
            elif word.text in [".", "!", "?"]:
                seen_period = True
        if start < len(doc):
            yield doc[start : len(doc)]
 class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
    documents. The similarity model can be any object obeying the Thinc `Model`
    interface. By default, the model concatenates the elementwise mean and
    elementwise max of the two tensors, and compares them using the
    Cauchy-like similarity function from Chen (2013):
        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
    Where W is a vector of dimension weights, initialized to 1.
    """
    name = "similarity"
    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)
    @classmethod
    def Model(cls, length):
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
    def __call__(self, doc):
        """Install similarity hook"""
        doc.user_hooks["similarity"] = self.predict
        return doc
    def pipe(self, docs, **kwargs):
        for doc in docs:
            yield self(doc)
    def predict(self, doc1, doc2):
        self.require_model()
        return self.model.predict([(doc1, doc2)])
    def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
        self.require_model()
        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
        """Allocate model, using width from tensorizer in pipeline.
        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -3,271 +3,39 @@
 # coding: utf8
 from __future__ import unicode_literals
 import numpy
 cimport numpy as np
-from collections import OrderedDict, defaultdict
+
 import numpy
 from collections import OrderedDict
 import srsly
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
 from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
-from .tokens.doc cimport Doc
+from ..tokens.doc cimport Doc
-from .syntax.nn_parser cimport Parser
+from ..syntax.nn_parser cimport Parser
-from .syntax import nonproj
+from ..syntax.ner cimport BiluoPushDown
-from .syntax.ner cimport BiluoPushDown
+from ..syntax.arc_eager cimport ArcEager
-from .syntax.arc_eager cimport ArcEager
+from ..morphology cimport Morphology
-from .morphology cimport Morphology
+from ..vocab cimport Vocab
 from .vocab cimport Vocab
 from .syntax import nonproj
 from .matcher import Matcher
-from .matcher import Matcher, PhraseMatcher
+from ..syntax import nonproj
-from .tokens.span import Span
+from ..attrs import POS, ID
-from .attrs import POS, ID
+from ..parts_of_speech import X
-from .parts_of_speech import X
+from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
-from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
+from .._ml import link_vectors_to_models, zero_init, flatten
-from ._ml import build_simple_cnn_text_classifier
+from .._ml import masked_language_model, create_default_optimizer
-from ._ml import link_vectors_to_models, zero_init, flatten
+from ..errors import Errors, TempErrors
-from ._ml import create_default_optimizer
+from .. import util
 from ._ml import masked_language_model
 from .errors import Errors, TempErrors
 from .compat import basestring_
 from . import util
-class SentenceSegmenter(object):
+def _load_cfg(path):
-    """A simple spaCy hook, to allow custom sentence boundary detection logic
+    if path.exists():
-    (that doesn't require the dependency parse). To change the sentence
+        return srsly.read_json(path)
-    boundary detection strategy, pass a generator function `strategy` on
+    else:
-    initialization, or assign a new strategy to the .strategy attribute.
+        return {}
    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    """
    name = 'sentencizer'
    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == 'on_punct':
            strategy = self.split_on_punct
        self.strategy = strategy
    def __call__(self, doc):
        doc.user_hooks['sents'] = self.strategy
        return doc
    @staticmethod
    def split_on_punct(doc):
        start = 0
        seen_period = False
        for i, word in enumerate(doc):
            if seen_period and not word.is_punct:
                yield doc[start:word.i]
                start = word.i
                seen_period = False
            elif word.text in ['.', '!', '?']:
                seen_period = True
        if start < len(doc):
            yield doc[start:len(doc)]
 def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.
    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        return doc
    spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep)
             for np in doc.noun_chunks]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc
 def merge_entities(doc):
    """Merge entities into a single token.
    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun entities.
    """
    spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
             for e in doc.ents]
    for start, end, tag, dep, ent_type in spans:
        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
    return doc
 def merge_subtokens(doc, label='subtok'):
    merger = Matcher(doc.vocab)
    merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
    matches = merger(doc)
    spans = [doc[start:end+1] for _, start, end in matches]
    offsets = [(span.start_char, span.end_char) for span in spans]
    for start_char, end_char in offsets:
        doc.merge(start_char, end_char)
    return doc
 class EntityRuler(object):
    name = 'entity_ruler'
    def __init__(self, nlp, **cfg):
        """Initialise the entitiy ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
        RETURNS (EntityRuler): The newly constructed object.
        """
        self.nlp = nlp
        self.overwrite = cfg.get('overwrite_ents', False)
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self.matcher = Matcher(nlp.vocab)
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        patterns = cfg.get('patterns')
        if patterns is not None:
            self.add_patterns(patterns)
    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns
    def __contains__(self, label):
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns
    def __call__(self, doc):
        """Find matches in document and add them as entities.
        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        matches = set([(m_id, start, end) for m_id, start, end in matches
                       if start != end])
        get_sort_key = lambda m: (m[2] - m[1], m[1])
        matches = sorted(matches, key=get_sort_key, reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [e for e in entities
                            if not (e.start < end and e.end > start)]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc
    @property
    def labels(self):
        """All labels present in the match patterns.
        RETURNS (set): The string labels.
        """
        all_labels = set(self.token_patterns.keys())
        all_labels.update(self.phrase_patterns.keys())
        return all_labels
    @property
    def patterns(self):
        """Get all patterns that were added to the entity ruler.
        RETURNS (list): The original patterns, one dictionary per pattern.
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                all_patterns.append({'label': label, 'pattern': pattern})
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                all_patterns.append({'label': label, 'pattern': pattern.text})
        return all_patterns
    def add_patterns(self, patterns):
        """Add patterns to the entitiy ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
        patterns (list): The patterns to add.
        """
        for entry in patterns:
            label = entry['label']
            pattern = entry['pattern']
            if isinstance(pattern, basestring_):
                self.phrase_patterns[label].append(self.nlp(pattern))
            elif isinstance(pattern, list):
                self.token_patterns[label].append(pattern)
            else:
                raise ValueError(Errors.E097.format(pattern=pattern))
        for label, patterns in self.token_patterns.items():
            self.matcher.add(label, None, *patterns)
        for label, patterns in self.phrase_patterns.items():
            self.phrase_matcher.add(label, None, *patterns)
    def from_bytes(self, patterns_bytes, **kwargs):
        """Load the entity ruler from a bytestring.
        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        patterns = srsly.msgpack_loads(patterns_bytes)
        self.add_patterns(patterns)
        return self
    def to_bytes(self, **kwargs):
        """Serialize the entity ruler patterns to a bytestring.
        RETURNS (bytes): The serialized patterns.
        """
        return srsly.msgpack_dumps(self.patterns)
    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.
        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = util.ensure_path(path)
        path = path.with_suffix('.jsonl')
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self
    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).
        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = util.ensure_path(path)
        path = path.with_suffix('.jsonl')
        srsly.write_jsonl(path, self.patterns)
 class Pipe(object):
@ -275,6 +43,7 @@ class Pipe(object):
    it defines the interface that components should follow to function as
    components in a spaCy analysis pipeline.
    """
    name = None
    @classmethod
@ -300,7 +69,7 @@ class Pipe(object):
    def require_model(self):
        """Raise an error if the component's model is not initialized."""
-        if getattr(self, 'model', None) in (None, True, False):
+        if getattr(self, "model", None) in (None, True, False):
            raise ValueError(Errors.E109.format(name=self.name))
    def pipe(self, stream, batch_size=128, n_threads=-1):
@ -326,7 +95,7 @@ class Pipe(object):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError
-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.
@ -353,11 +122,11 @@ class Pipe(object):
        raise NotImplementedError
    def create_optimizer(self):
-        return create_default_optimizer(self.model.ops,
+        return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
                                        **self.cfg.get('optimizer', {}))
-    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
+    def begin_training(
-                       **kwargs):
+        self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
    ):
        """Initialize the pipe for training, using data exampes if available.
        If no model has been initialized yet, the model is added."""
        if self.model is True:
@ -375,70 +144,70 @@ class Pipe(object):
    def to_bytes(self, **exclude):
        """Serialize the pipe to a bytestring."""
        serialize = OrderedDict()
-        serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
+        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        if self.model in (True, False, None):
-            serialize['model'] = lambda: self.model
+            serialize["model"] = lambda: self.model
        else:
-            serialize['model'] = self.model.to_bytes
+            serialize["model"] = self.model.to_bytes
-        serialize['vocab'] = self.vocab.to_bytes
+        serialize["vocab"] = self.vocab.to_bytes
        return util.to_bytes(serialize, exclude)
    def from_bytes(self, bytes_data, **exclude):
        """Load the pipe from a bytestring."""
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
-            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
+            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
-                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(b)
-        deserialize = OrderedDict((
+        deserialize = OrderedDict(
-            ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
+            (
-            ('vocab', lambda b: self.vocab.from_bytes(b)),
+                ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
-            ('model', load_model),
+                ("vocab", lambda b: self.vocab.from_bytes(b)),
-        ))
+                ("model", load_model),
            )
        )
        util.from_bytes(bytes_data, deserialize, exclude)
        return self
    def to_disk(self, path, **exclude):
        """Serialize the pipe to disk."""
        serialize = OrderedDict()
-        serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
+        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
-        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        if self.model not in (None, True, False):
-            serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
+            serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
        util.to_disk(path, serialize, exclude)
    def from_disk(self, path, **exclude):
        """Load the pipe from disk."""
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
-            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
+            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
-                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
-            self.model.from_bytes(p.open('rb').read())
+            self.model.from_bytes(p.open("rb").read())
-        deserialize = OrderedDict((
+        deserialize = OrderedDict(
-            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
+            (
-            ('vocab', lambda p: self.vocab.from_disk(p)),
+                ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
-            ('model', load_model),
+                ("vocab", lambda p: self.vocab.from_disk(p)),
-        ))
+                ("model", load_model),
            )
        )
        util.from_disk(path, deserialize, exclude)
        return self
 def _load_cfg(path):
    if path.exists():
        return srsly.read_json(path)
    else:
        return {}
 class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens."""
-    name = 'tensorizer'
+
    name = "tensorizer"
    @classmethod
    def Model(cls, output_size=300, **cfg):
@ -449,7 +218,7 @@ class Tensorizer(Pipe):
        **cfg: Config parameters.
        RETURNS (Model): A `thinc.neural.Model` or similar instance.
        """
-        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
+        input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
        return zero_init(Affine(output_size, input_size, drop_factor=0.0))
    def __init__(self, vocab, model=True, **cfg):
@ -470,7 +239,7 @@ class Tensorizer(Pipe):
        self.model = model
        self.input_models = []
        self.cfg = dict(cfg)
-        self.cfg.setdefault('cnn_maxout_pieces', 3)
+        self.cfg.setdefault("cnn_maxout_pieces", 3)
    def __call__(self, doc):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -516,10 +285,12 @@ class Tensorizer(Pipe):
        """
        for doc, tensor in zip(docs, tensors):
            if tensor.shape[0] != len(doc):
-                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
+                raise ValueError(
                    Errors.E076.format(rows=tensor.shape[0], words=len(doc))
                )
            doc.tensor = tensor
-    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
        """Update the model.
        docs (iterable): A batch of `Doc` objects.
@ -545,7 +316,7 @@ class Tensorizer(Pipe):
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input, sgd=sgd)
        if losses is not None:
-            losses.setdefault(self.name, 0.)
+            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        return loss
@ -553,11 +324,10 @@ class Tensorizer(Pipe):
        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
        target = self.vocab.vectors.data[ids]
        d_scores = (prediction - target) / prediction.shape[0]
-        loss = (d_scores**2).sum()
+        loss = (d_scores ** 2).sum()
        return loss, d_scores
-    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None,
+    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
                        **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.
@ -566,7 +336,7 @@ class Tensorizer(Pipe):
        """
        if pipeline is not None:
            for name, model in pipeline:
-                if getattr(model, 'tok2vec', None):
+                if getattr(model, "tok2vec", None):
                    self.input_models.append(model.tok2vec)
        if self.model is True:
            self.model = self.Model(**self.cfg)
@ -1086,61 +856,6 @@ class ClozeMultitask(Pipe):
            losses[self.name] += loss
 class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
    documents. The similarity model can be any object obeying the Thinc `Model`
    interface. By default, the model concatenates the elementwise mean and
    elementwise max of the two tensors, and compares them using the
    Cauchy-like similarity function from Chen (2013):
        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
    Where W is a vector of dimension weights, initialized to 1.
    """
    name = 'similarity'
    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)
    @classmethod
    def Model(cls, length):
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
    def __call__(self, doc):
        """Install similarity hook"""
        doc.user_hooks['similarity'] = self.predict
        return doc
    def pipe(self, docs, **kwargs):
        for doc in docs:
            yield self(doc)
    def predict(self, doc1, doc2):
        self.require_model()
        return self.model.predict([(doc1, doc2)])
    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
        self.require_model()
        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
        """Allocate model, using width from tensorizer in pipeline.
        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 class TextCategorizer(Pipe):
    name = 'textcat'
@ -1299,13 +1014,12 @@ cdef class DependencyParser(Parser):
 cdef class EntityRecognizer(Parser):
-    name = 'ner'
+    name = "ner"
    TransitionSystem = BiluoPushDown
    nr_feature = 6
    def add_multitask_objective(self, target):
-        if target == 'cloze':
+        if target == "cloze":
            cloze = ClozeMultitask(self.vocab)
            self._multitasks.append(cloze)
        else:
@ -1326,8 +1040,8 @@ cdef class EntityRecognizer(Parser):
    def labels(self):
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        return [move.split('-')[1] for move in self.move_names
+        return [move.split("-")[1] for move in self.move_names
-                if move[0] in ('B', 'I', 'L', 'U')]
+                if move[0] in ("B", "I", "L", "U")]
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']