Mirror of https://github.com/explosion/spaCy.git

Tidy up pipeline

parent b4d226a3f1
commit ba5e646219

@@ -3,26 +3,17 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from thinc.api import chain, layerize, with_getitem
 import numpy
 cimport numpy as np
 import cytoolz
-import util
 from collections import OrderedDict
 import ujson
 import msgpack
 
-from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
-from thinc.i2v import HashEmbed
-from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
-from thinc.t2t import ExtractWindow, ParametricAttention
-from thinc.misc import Residual
-from thinc.misc import BatchNorm as BN
-from thinc.misc import LayerNorm as LN
-
+from thinc.api import chain
+from thinc.v2v import Softmax
+from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical
-
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .tokens.doc cimport Doc
@@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
 from .syntax import nonproj
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .tagger import Tagger
-from .syntax.stateclass cimport StateClass
-from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
 from .syntax import nonproj
 from .compat import json_dumps
 
-from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
-from ._ml import rebatch, Tok2Vec, flatten
-from ._ml import build_text_classifier, build_tagger_model
-from ._ml import link_vectors_to_models
+from .attrs import POS
+from .parts_of_speech import X
+from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
+from ._ml import link_vectors_to_models
+from . import util
 
 
 class SentenceSegmenter(object):
     """A simple spaCy hook, to allow custom sentence boundary detection logic
-    (that doesn't require the dependency parse).
-
-    To change the sentence boundary detection strategy, pass a generator
-    function `strategy` on initialization, or assign a new strategy to
-    the .strategy attribute.
-
+    (that doesn't require the dependency parse). To change the sentence
+    boundary detection strategy, pass a generator function `strategy` on
+    initialization, or assign a new strategy to the .strategy attribute.
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
     """
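The docstring above fixes the strategy contract: a generator that takes a `Doc` and yields `Span` objects. A minimal sketch of a custom strategy (the `newline_strategy` name is illustrative, and the constructor call in the comment assumes the `(vocab, strategy=...)` signature used elsewhere in this file, which this diff does not show):

    def newline_strategy(doc):
        """Illustrative strategy: one Span per newline-delimited segment."""
        start = 0
        for word in doc:
            if word.text == '\n':
                if start < word.i:
                    yield doc[start:word.i]
                start = word.i + 1
        if start < len(doc):
            yield doc[start:len(doc)]

    # Pass it on initialization, or assign it afterwards:
    #     segmenter = SentenceSegmenter(nlp.vocab, strategy=newline_strategy)
    #     segmenter.strategy = newline_strategy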
@@ -74,16 +59,20 @@ class SentenceSegmenter(object):
         seen_period = False
         for i, word in enumerate(doc):
             if seen_period and not word.is_punct:
-                yield doc[start : word.i]
+                yield doc[start:word.i]
                 start = word.i
                 seen_period = False
            elif word.text in ['.', '!', '?']:
                seen_period = True
         if start < len(doc):
-            yield doc[start : len(doc)]
+            yield doc[start:len(doc)]
 
 
 class Pipe(object):
+    """This class is not instantiated directly. Components inherit from it, and
+    it defines the interface that components should follow to function as
+    components in a spaCy analysis pipeline.
+    """
     name = None
 
     @classmethod
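The new `Pipe` docstring describes an interface rather than showing one. As a hedged sketch only (the component below is invented for illustration; the method names `predict` and `set_annotations` are taken from the `Tagger` and `Tensorizer` code later in this file):

    class LengthFlagger(Pipe):
        """Illustrative component: flags whether each doc is 'long'."""
        name = 'length_flagger'

        def __init__(self, vocab, model=True, **cfg):
            self.vocab = vocab
            self.model = model
            self.cfg = dict(cfg)

        def __call__(self, doc):
            flags = self.predict([doc])
            self.set_annotations([doc], flags)
            return doc

        def predict(self, docs):
            return [len(doc) >= 25 for doc in docs]

        def set_annotations(self, docs, flags):
            for doc, is_long in zip(docs, flags):
                doc.user_data['is_long'] = is_long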
@@ -149,8 +138,7 @@ class Pipe(object):
         link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
-        """Modify the pipe's model, to use the given parameter values.
-        """
+        """Modify the pipe's model, to use the given parameter values."""
         with self.model.use_params(params):
             yield
 
@@ -235,8 +223,8 @@ class Tensorizer(Pipe):
         """Construct a new statistical model. Weights are not allocated on
         initialisation.
 
-        vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
-            instance with the `Doc` objects it will process.
+        vocab (Vocab): A `Vocab` instance. The model must share the same
+            `Vocab` instance with the `Doc` objects it will process.
         model (Model): A `Model` instance or `True` allocate one later.
         **cfg: Config parameters.
 
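The shared-`Vocab` requirement above is the main constraint on construction. A sketch, assuming `Tensorizer` is importable from `spacy.pipeline` as the `__all__` list at the end of this file suggests:

    from spacy.vocab import Vocab
    from spacy.pipeline import Tensorizer

    vocab = Vocab()
    tensorizer = Tensorizer(vocab, model=True)  # True: allocate weights later
    # Every Doc this component processes must be built from the same `vocab`.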
@@ -280,7 +268,7 @@ class Tensorizer(Pipe):
         """Return a single tensor for a batch of documents.
 
         docs (iterable): A sequence of `Doc` objects.
-        RETURNS (object): Vector representations for each token in the documents.
+        RETURNS (object): Vector representations for each token in the docs.
         """
         tokvecs = self.model(docs)
         return tokvecs
@@ -289,7 +277,7 @@ class Tensorizer(Pipe):
         """Set the tensor attribute for a batch of documents.
 
         docs (iterable): A sequence of `Doc` objects.
-        tokvecs (object): Vector representation for each token in the documents.
+        tokvecs (object): Vector representation for each token in the docs.
         """
         for doc, tokvecs in zip(docs, tokvecses):
             assert tokvecs.shape[0] == len(doc)
@@ -328,12 +316,14 @@ class Tensorizer(Pipe):
 
 class Tagger(Pipe):
     name = 'tagger'
 
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
+        self.cfg.setdefault('pretrained_dims',
+                            self.vocab.vectors.data.shape[1])
 
     def __call__(self, doc):
         tags = self.predict([doc])
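The `setdefault` calls in `Tagger.__init__` fill in config defaults without clobbering caller-supplied values. The dict-level behaviour in isolation (the numbers are illustrative):

    cfg = dict(cnn_maxout_pieces=3)           # caller override
    cfg.setdefault('cnn_maxout_pieces', 2)    # no effect: key already set
    cfg.setdefault('pretrained_dims', 300)    # filled in
    assert cfg == {'cnn_maxout_pieces': 3, 'pretrained_dims': 300}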
@@ -353,8 +343,7 @@ class Tagger(Pipe):
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
-        guesses = self.model.ops.unflatten(guesses,
-                                           [len(d) for d in docs])
+        guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
         return guesses
 
     def set_annotations(self, docs, batch_tag_ids):
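`ops.unflatten` splits the flat prediction array back into one piece per `Doc`, inverting the `ops.flatten` used in `get_loss` below. A sketch with numpy (the `thinc.neural.ops` import path is an assumption matching the thinc 6.x series this code targets):

    import numpy
    from thinc.neural.ops import NumpyOps

    ops = NumpyOps()
    flat = numpy.asarray([1, 2, 3, 4, 5])
    per_doc = ops.unflatten(flat, [2, 3])
    # per_doc[0] -> array([1, 2]); per_doc[1] -> array([3, 4, 5])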
@@ -387,8 +376,8 @@ class Tagger(Pipe):
 
     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
-
+        tag_index = {tag: i
+                     for i, tag in enumerate(self.vocab.morphology.tag_names)}
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
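`tag_index` maps each tag name to its column in the flattened score matrix, so gold tag strings become column indices:

    tag_names = ('DET', 'NOUN', 'VERB')      # illustrative tag set
    tag_index = {tag: i for i, tag in enumerate(tag_names)}
    gold_tags = ['NOUN', 'VERB', 'DET']
    assert [tag_index[t] for t in gold_tags] == [1, 2, 0]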
@@ -443,17 +432,18 @@ class Tagger(Pipe):
         serialize['model'] = self.model.to_bytes
         serialize['vocab'] = self.vocab.to_bytes
 
-        serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
-                                                     use_bin_type=True,
-                                                     encoding='utf8')
+        serialize['tag_map'] = lambda: msgpack.dumps(
+            self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
         def load_model(b):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width',
-                        self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+                token_vector_width = util.env_opt(
+                    'token_vector_width',
+                    self.cfg.get('token_vector_width', 128))
+                self.model = self.Model(self.vocab.morphology.n_tags,
+                                        **self.cfg)
             self.model.from_bytes(b)
 
         def load_tag_map(b):
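The `tag_map` is serialized with msgpack exactly as in the lambda above. A round-trip sketch; note the `encoding` argument matches the msgpack-python releases current when this commit was made and was removed in later versions:

    import msgpack

    tag_map = {'NN': {'pos': 'NOUN'}}  # illustrative single entry
    blob = msgpack.dumps(tag_map, use_bin_type=True, encoding='utf8')
    assert msgpack.loads(blob, encoding='utf8') == tag_map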
@@ -509,11 +499,11 @@ class Tagger(Pipe):
 
 
 class MultitaskObjective(Tagger):
-    '''Assist training of a parser or tagger, by training a side-objective.
-
-    Experimental
-    '''
+    """Experimental: Assist training of a parser or tagger, by training a
+    side-objective.
+    """
     name = 'nn_labeller'
 
     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
@@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
         elif hasattr(target, '__call__'):
             self.make_label = target
         else:
-            raise ValueError(
-                "MultitaskObjective target should be function or one of "
-                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
+            raise ValueError("MultitaskObjective target should be function or "
+                             "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
+        self.cfg.setdefault('pretrained_dims',
+                            self.vocab.vectors.data.shape[1])
 
     @property
     def labels(self):
@@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):
 
 class SimilarityHook(Pipe):
     """
-    Experimental
-
-    A pipeline component to install a hook for supervised similarity into
-    Doc objects. Requires a Tensorizer to pre-process documents. The similarity
-    model can be any object obeying the Thinc Model interface. By default,
-    the model concatenates the elementwise mean and elementwise max of the two
-    tensors, and compares them using the Cauchy-like similarity function
-    from Chen (2013):
+    Experimental: A pipeline component to install a hook for supervised
+    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
+    documents. The similarity model can be any object obeying the Thinc `Model`
+    interface. By default, the model concatenates the elementwise mean and
+    elementwise max of the two tensors, and compares them using the
+    Cauchy-like similarity function from Chen (2013):
 
-    similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
+    >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
 
     Where W is a vector of dimension weights, initialized to 1.
     """
     name = 'similarity'
 
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
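The similarity function in the docstring is easy to check numerically. A small sketch with plain numpy (the vectors and weights are illustrative; `W` starts at ones, as the docstring says):

    import numpy

    def cauchy_similarity(vec1, vec2, W):
        # similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
        return 1. / (1. + (W * (vec1 - vec2) ** 2).sum())

    W = numpy.ones(4)
    a = numpy.array([1., 0., 0., 1.])
    b = numpy.array([1., 1., 0., 1.])
    print(cauchy_similarity(a, a, W))  # 1.0: identical vectors
    print(cauchy_similarity(a, b, W))  # 0.5: decays with weighted squared distance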
@@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
 
     def begin_training(self, _=tuple(), pipeline=None):
-        """
-        Allocate model, using width from tensorizer in pipeline.
+        """Allocate model, using width from tensorizer in pipeline.
 
         gold_tuples (iterable): Gold-standard training data.
         pipeline (list): The pipeline the model is part of.
@@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            labeller.begin_training(gold_tuples, pipeline=pipeline,
+                                    tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)
 
     def __reduce__(self):
-        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
+        return (DependencyParser, (self.vocab, self.moves, self.model),
+                None, None)
 
 
 cdef class EntityRecognizer(Parser):
@@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            labeller.begin_training(gold_tuples, pipeline=pipeline,
+                                    tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)
 
     def __reduce__(self):
-        return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+        return (EntityRecognizer, (self.vocab, self.moves, self.model),
+                None, None)
 
 
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
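The reformatted `__reduce__` methods above follow the standard pickle protocol: return a `(callable, args, ...)` tuple, and unpickling rebuilds the object as `callable(*args)`. A self-contained sketch of the same protocol:

    import pickle

    class Point(object):
        def __init__(self, x, y):
            self.x, self.y = x, y

        def __reduce__(self):
            return (Point, (self.x, self.y))

    p = pickle.loads(pickle.dumps(Point(1, 2)))
    assert (p.x, p.y) == (1, 2)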