Tidy up pipeline
parent b4d226a3f1
commit ba5e646219
@@ -3,26 +3,17 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from thinc.api import chain, layerize, with_getitem
 import numpy
 cimport numpy as np
 import cytoolz
-import util
 from collections import OrderedDict
 import ujson
 import msgpack
 
-from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
-from thinc.i2v import HashEmbed
-from thinc.t2v import Pooling, max_pool, mean_pool, sum_pool
-from thinc.t2t import ExtractWindow, ParametricAttention
-from thinc.misc import Residual
-from thinc.misc import BatchNorm as BN
-from thinc.misc import LayerNorm as LN
-
+from thinc.api import chain
+from thinc.v2v import Softmax
+from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical
-
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .tokens.doc cimport Doc
@@ -30,29 +21,23 @@ from .syntax.nn_parser cimport Parser
 from .syntax import nonproj
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
-from .tagger import Tagger
-from .syntax.stateclass cimport StateClass
-from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
 from .syntax import nonproj
 from .compat import json_dumps
 
-from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
-from ._ml import rebatch, Tok2Vec, flatten
-from ._ml import build_text_classifier, build_tagger_model
-from ._ml import link_vectors_to_models
+from .attrs import POS
 from .parts_of_speech import X
+from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
+from ._ml import link_vectors_to_models
+from . import util
 
 
 class SentenceSegmenter(object):
     """A simple spaCy hook, to allow custom sentence boundary detection logic
-    (that doesn't require the dependency parse).
-
-    To change the sentence boundary detection strategy, pass a generator
-    function `strategy` on initialization, or assign a new strategy to
-    the .strategy attribute.
-
+    (that doesn't require the dependency parse). To change the sentence
+    boundary detection strategy, pass a generator function `strategy` on
+    initialization, or assign a new strategy to the .strategy attribute.
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
     """
@@ -74,16 +59,20 @@ class SentenceSegmenter(object):
         seen_period = False
         for i, word in enumerate(doc):
             if seen_period and not word.is_punct:
-                yield doc[start : word.i]
+                yield doc[start:word.i]
                 start = word.i
                 seen_period = False
             elif word.text in ['.', '!', '?']:
                 seen_period = True
         if start < len(doc):
-            yield doc[start : len(doc)]
+            yield doc[start:len(doc)]
 
 
 class Pipe(object):
+    """This class is not instantiated directly. Components inherit from it, and
+    it defines the interface that components should follow to function as
+    components in a spaCy analysis pipeline.
+    """
     name = None
 
     @classmethod
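The SentenceSegmenter docstring above spells out the strategy contract: a generator that takes a `Doc` and yields `Span` objects. A minimal sketch of a custom strategy under that contract (the newline rule and the `nlp` object are illustrative, not part of this commit):

    def split_on_newlines(doc):
        start = 0
        for word in doc:
            # Start a new sentence after each newline token.
            if word.text == '\n' and word.i > start:
                yield doc[start:word.i]
                start = word.i
        if start < len(doc):
            yield doc[start:len(doc)]

    sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)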
@@ -149,8 +138,7 @@ class Pipe(object):
         link_vectors_to_models(self.vocab)
 
     def use_params(self, params):
-        """Modify the pipe's model, to use the given parameter values.
-        """
+        """Modify the pipe's model, to use the given parameter values."""
         with self.model.use_params(params):
             yield
 
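As the body shows, `use_params` is a plain generator wrapping `model.use_params`, so a caller drives it by resuming the generator rather than via `with`. A sketch, assuming a trained `tagger` and a Thinc optimizer exposing an `averages` dict (both names are illustrative):

    context = tagger.use_params(optimizer.averages)
    next(context)                 # enter: model now uses the averaged weights
    tagger.to_disk('/tmp/tagger')
    next(context, None)           # exit: the original weights are restored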
@@ -235,8 +223,8 @@ class Tensorizer(Pipe):
         """Construct a new statistical model. Weights are not allocated on
         initialisation.
 
-        vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
-            instance with the `Doc` objects it will process.
+        vocab (Vocab): A `Vocab` instance. The model must share the same
+            `Vocab` instance with the `Doc` objects it will process.
         model (Model): A `Model` instance or `True` allocate one later.
         **cfg: Config parameters.
 
@@ -280,7 +268,7 @@ class Tensorizer(Pipe):
         """Return a single tensor for a batch of documents.
 
         docs (iterable): A sequence of `Doc` objects.
-        RETURNS (object): Vector representations for each token in the documents.
+        RETURNS (object): Vector representations for each token in the docs.
         """
         tokvecs = self.model(docs)
         return tokvecs
@@ -289,7 +277,7 @@ class Tensorizer(Pipe):
         """Set the tensor attribute for a batch of documents.
 
         docs (iterable): A sequence of `Doc` objects.
-        tokvecs (object): Vector representation for each token in the documents.
+        tokvecs (object): Vector representation for each token in the docs.
         """
         for doc, tokvecs in zip(docs, tokvecses):
             assert tokvecs.shape[0] == len(doc)
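The assertion above pins down the contract between `predict` and `set_annotations`: one vector row per token, per document. A sketch of the round trip (the `nlp` and `tensorizer` objects are assumed, not constructed here):

    docs = [nlp.make_doc(u'Hello world'), nlp.make_doc(u'Another example')]
    tokvecses = tensorizer.predict(docs)
    for doc, tokvecs in zip(docs, tokvecses):
        assert tokvecs.shape[0] == len(doc)       # one row per token
    tensorizer.set_annotations(docs, tokvecses)   # attaches doc.tensor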
@@ -328,12 +316,14 @@ class Tensorizer(Pipe):
+
 class Tagger(Pipe):
     name = 'tagger'
 
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
+        self.cfg.setdefault('pretrained_dims',
+                            self.vocab.vectors.data.shape[1])
 
     def __call__(self, doc):
         tags = self.predict([doc])
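A note on the `cfg` pattern above: keyword arguments are copied into `self.cfg`, and `setdefault` only fills in values the caller didn't supply. For instance (the loaded `nlp` object is assumed):

    tagger = Tagger(nlp.vocab, cnn_maxout_pieces=3)
    assert tagger.cfg['cnn_maxout_pieces'] == 3   # caller's value wins
    assert 'pretrained_dims' in tagger.cfg        # default filled in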
@@ -353,8 +343,7 @@ class Tagger(Pipe):
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
-        guesses = self.model.ops.unflatten(guesses,
-                                           [len(d) for d in docs])
+        guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
         return guesses
 
     def set_annotations(self, docs, batch_tag_ids):
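`ops.unflatten` is the inverse of `ops.flatten`: it splits the flat per-token predictions back into one array per document. A self-contained sketch against the thinc 6.x ops API in use at this commit:

    import numpy
    from thinc.neural.ops import NumpyOps

    ops = NumpyOps()
    flat = numpy.asarray([0, 1, 2, 3, 4])   # tag ids for two docs
    unflat = ops.unflatten(flat, [2, 3])    # doc lengths 2 and 3
    assert [a.tolist() for a in unflat] == [[0, 1], [2, 3, 4]]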
@@ -387,8 +376,8 @@ class Tagger(Pipe):
 
     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
+        tag_index = {tag: i
+                     for i, tag in enumerate(self.vocab.morphology.tag_names)}
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
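The rewrapped comprehension above builds `tag_index`, mapping each tag name to its column in the softmax output. A toy illustration (the tag inventory is made up):

    tag_names = ('DT', 'NN', 'VBZ')   # stand-in for morphology.tag_names
    tag_index = {tag: i for i, tag in enumerate(tag_names)}
    assert tag_index == {'DT': 0, 'NN': 1, 'VBZ': 2}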
@@ -443,17 +432,18 @@ class Tagger(Pipe):
         serialize['model'] = self.model.to_bytes
         serialize['vocab'] = self.vocab.to_bytes
 
-        serialize['tag_map'] = lambda: msgpack.dumps(self.vocab.morphology.tag_map,
-                                                     use_bin_type=True,
-                                                     encoding='utf8')
+        serialize['tag_map'] = lambda: msgpack.dumps(
+            self.vocab.morphology.tag_map, use_bin_type=True, encoding='utf8')
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
         def load_model(b):
             if self.model is True:
-                token_vector_width = util.env_opt('token_vector_width',
-                                                  self.cfg.get('token_vector_width', 128))
-                self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
+                token_vector_width = util.env_opt(
+                    'token_vector_width',
+                    self.cfg.get('token_vector_width', 128))
+                self.model = self.Model(self.vocab.morphology.n_tags,
+                                        **self.cfg)
             self.model.from_bytes(b)
 
         def load_tag_map(b):
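The `tag_map` entry round-trips through msgpack, matching `load_tag_map` in `from_bytes`. A sketch using the msgpack API of that era (the `encoding` argument was later deprecated); the toy map stands in for the real one:

    import msgpack

    tag_map = {u'NN': {u'pos': u'NOUN'}}   # illustrative only
    blob = msgpack.dumps(tag_map, use_bin_type=True, encoding='utf8')
    assert msgpack.loads(blob, encoding='utf8') == tag_map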
@@ -509,11 +499,11 @@ class Tagger(Pipe):
 
 
 class MultitaskObjective(Tagger):
-    '''Assist training of a parser or tagger, by training a side-objective.
-    Experimental
-    '''
+    """Experimental: Assist training of a parser or tagger, by training a
+    side-objective.
+    """
     name = 'nn_labeller'
 
     def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
@@ -530,12 +520,12 @@ class MultitaskObjective(Tagger):
         elif hasattr(target, '__call__'):
             self.make_label = target
         else:
-            raise ValueError(
-                "MultitaskObjective target should be function or one of "
-                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
+            raise ValueError("MultitaskObjective target should be function or "
+                             "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
+        self.cfg.setdefault('pretrained_dims',
+                            self.vocab.vectors.data.shape[1])
 
     @property
     def labels(self):
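Per the clarified error message, `target` accepts either one of the named objectives or a callable, which becomes `self.make_label`. For example (the loaded `nlp` object is assumed):

    # Train a tag side-objective alongside the parser.
    labeller = MultitaskObjective(nlp.vocab, target='tag')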
@@ -623,20 +613,19 @@ class MultitaskObjective(Tagger):
 
 class SimilarityHook(Pipe):
     """
-    Experimental
-    A pipeline component to install a hook for supervised similarity into
-    Doc objects. Requires a Tensorizer to pre-process documents. The similarity
-    model can be any object obeying the Thinc Model interface. By default,
-    the model concatenates the elementwise mean and elementwise max of the two
-    tensors, and compares them using the Cauchy-like similarity function
-    from Chen (2013):
+    Experimental: A pipeline component to install a hook for supervised
+    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
+    documents. The similarity model can be any object obeying the Thinc `Model`
+    interface. By default, the model concatenates the elementwise mean and
+    elementwise max of the two tensors, and compares them using the
+    Cauchy-like similarity function from Chen (2013):
 
-    similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
+    >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
 
     Where W is a vector of dimension weights, initialized to 1.
     """
     name = 'similarity'
 
     def __init__(self, vocab, model=True, **cfg):
         self.vocab = vocab
         self.model = model
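A worked check of the docstring's formula, with `W` initialized to ones as stated (the vectors are made up):

    import numpy

    W = numpy.ones((3,))
    vec1 = numpy.array([1., 0., 2.])
    vec2 = numpy.array([0., 0., 2.])
    sim = 1. / (1. + (W * (vec1 - vec2) ** 2).sum())
    assert abs(sim - 0.5) < 1e-8   # squared distance 1.0 -> similarity 0.5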
@@ -662,8 +651,7 @@ class SimilarityHook(Pipe):
         sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
 
     def begin_training(self, _=tuple(), pipeline=None):
-        """
-        Allocate model, using width from tensorizer in pipeline.
+        """Allocate model, using width from tensorizer in pipeline.
 
         gold_tuples (iterable): Gold-standard training data.
         pipeline (list): The pipeline the model is part of.
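Per the docstring, `begin_training` sizes the similarity model from the tensorizer already in the shared pipeline. A hedged sketch (assumes the tensorizer's model is already allocated; construction and ordering are illustrative):

    tensorizer = Tensorizer(nlp.vocab)
    sim = SimilarityHook(nlp.vocab)
    sim.begin_training(pipeline=[tensorizer, sim])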
@@ -763,12 +751,14 @@ cdef class DependencyParser(Parser):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            labeller.begin_training(gold_tuples, pipeline=pipeline,
+                                    tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)
 
     def __reduce__(self):
-        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
+        return (DependencyParser, (self.vocab, self.moves, self.model),
+                None, None)
 
 
 cdef class EntityRecognizer(Parser):
@@ -781,12 +771,14 @@ cdef class EntityRecognizer(Parser):
         for target in []:
             labeller = MultitaskObjective(self.vocab, target=target)
             tok2vec = self.model[0]
-            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            labeller.begin_training(gold_tuples, pipeline=pipeline,
+                                    tok2vec=tok2vec)
             pipeline.append(labeller)
             self._multitasks.append(labeller)
 
     def __reduce__(self):
-        return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+        return (EntityRecognizer, (self.vocab, self.moves, self.model),
+                None, None)
 
 
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
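The rewrapped `__reduce__` still returns the `(callable, args, state, listitems)` tuple the pickle protocol expects, which is what makes these Cython classes picklable. A sketch (assumes a constructed `parser`):

    import pickle

    restored = pickle.loads(pickle.dumps(parser))
    # pickle recreates it via DependencyParser(vocab, moves, model).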