mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
169 lines
5.9 KiB
Cython
169 lines
5.9 KiB
Cython
# cython: infer_types=True
|
|
# cython: profile=True
|
|
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
from thinc.api import chain, layerize, with_getitem
|
|
from thinc.neural import Model, Softmax
|
|
import numpy
|
|
cimport numpy as np
|
|
|
|
from .tokens.doc cimport Doc
|
|
from .syntax.parser cimport Parser
|
|
#from .syntax.beam_parser cimport BeamParser
|
|
from .syntax.ner cimport BiluoPushDown
|
|
from .syntax.arc_eager cimport ArcEager
|
|
from .tagger import Tagger
|
|
from .gold cimport GoldParse
|
|
|
|
from thinc.api import add, layerize, chain, clone, concatenate
|
|
from thinc.neural import Model, Maxout, Softmax, Affine
|
|
from thinc.neural._classes.hash_embed import HashEmbed
|
|
from thinc.neural.util import to_categorical
|
|
|
|
from thinc.neural._classes.convolution import ExtractWindow
|
|
from thinc.neural._classes.resnet import Residual
|
|
from thinc.neural._classes.batchnorm import BatchNorm as BN
|
|
|
|
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
|
|
from ._ml import flatten, get_col, doc2feats
|
|
|
|
|
|
|
|
class TokenVectorEncoder(object):
    """Assign position-sensitive vectors to tokens, using a CNN or RNN.

    The encoder owns two models: ``self.model`` maps extracted features to
    token vectors, and ``self.tagger`` chains that model with a softmax layer
    over the vocabulary's morphological tags.
    """
    def __init__(self, vocab, token_vector_width, **cfg):
        """Create the feature extractor, the encoder model and the tagger.

        vocab: Vocabulary object; ``vocab.lang`` and
            ``vocab.morphology.n_tags`` are read here.
        token_vector_width: Width passed to ``build_model`` and used as the
            input size of the tagger's softmax layer.
        **cfg: Extra settings forwarded to ``build_model``.
        """
        self.vocab = vocab
        self.doc2feats = doc2feats()
        self.model = self.build_model(vocab.lang, token_vector_width, **cfg)
        self.tagger = chain(
            self.model,
            Softmax(self.vocab.morphology.n_tags,
                    token_vector_width))

    def build_model(self, lang, width, embed_size=5000, **cfg):
        """Build and return the token-to-vector model.

        lang: Accepted for interface compatibility; not used in this method.
        width: Width handed to every embedding and Maxout layer.
        embed_size: Table size for the LOWER embeddings; the PREFIX, SUFFIX
            and SHAPE embeddings use half of it.
        **cfg: Accepted for interface compatibility; not used in this method.
        RETURNS: The composed model.
        """
        cols = self.doc2feats.cols
        with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}):
            # Each feature column gets its own hashed embedding; LOWER is
            # embedded by the sum of two HashEmbed tables.
            lower = get_col(cols.index(LOWER)) >> (HashEmbed(width, embed_size)
                                                  + HashEmbed(width, embed_size))
            prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2)
            suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2)
            shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2)

            # Concatenate the four embeddings, mix them with a Maxout layer,
            # then apply four residual blocks that each look at a window of
            # nW=1 around every position.
            tok2vec = (
                flatten
                >> (lower | prefix | suffix | shape)
                >> Maxout(width, pieces=3)
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
                >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3))
            )
        return tok2vec

    def pipe(self, docs):
        """Tag a stream of documents and yield them back.

        The whole iterable is materialised into a list first, so that all
        documents are tagged in a single ``predict_tags`` call.

        docs: Iterable of Doc objects.
        YIELDS: The same Doc objects, after tagging.
        """
        docs = list(docs)
        self.predict_tags(docs)
        for doc in docs:
            yield doc

    def __call__(self, doc):
        """Tag a single document in place. Returns None."""
        self.predict_tags([doc])

    def begin_update(self, feats, drop=0.):
        """Run the encoder model forward for training.

        feats: Features as accepted by ``self.model``.
        drop: Dropout rate forwarded to the model.
        RETURNS: ``(tokvecs, bp_tokvecs)`` -- the model output and the
            callback returned by the model's own ``begin_update``.
        """
        tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop)
        return tokvecs, bp_tokvecs

    def predict_tags(self, docs, drop=0.):
        """Predict a tag for every token and assign it to the documents.

        docs: Sequence of Doc objects; they are modified in place.
        drop: Dropout rate forwarded to the tagger.
        """
        cdef Doc doc
        feats = self.doc2feats(docs)
        # A single forward pass is enough. The original code ran the tagger
        # twice here and threw the first result away.
        scores, _ = self.tagger.begin_update(feats, drop=drop)
        idx = 0
        guesses = scores.argmax(axis=1)
        if not isinstance(guesses, numpy.ndarray):
            # Presumably a GPU array that exposes ``.get()`` to copy the data
            # into a numpy array -- TODO confirm against the ops backend.
            guesses = guesses.get()
        # ``guesses`` is one flat sequence covering all documents; ``idx``
        # tracks the offset of the current document's first token.
        for doc in docs:
            tag_ids = guesses[idx:idx+len(doc)]
            for j, tag_id in enumerate(tag_ids):
                doc.vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
                idx += 1

    def update(self, docs_feats, golds, drop=0., sgd=None):
        """Update the tagger (and through it the encoder) from gold tags.

        docs_feats: A ``(docs, feats)`` pair; ``docs[0]`` supplies the tag
            names and ``feats`` is fed to the tagger.
        golds: Iterable of GoldParse objects whose ``tags`` are read in order.
        drop: Dropout rate forwarded to the tagger.
        sgd: Optimizer passed to the tagger's backward callback.
        """
        cdef int i, j, idx
        cdef GoldParse gold
        docs, feats = docs_feats
        scores, finish_update = self.tagger.begin_update(feats, drop=drop)

        # Map each tag name to its position in the morphology's tag list.
        tag_index = {tag: i for i, tag in enumerate(docs[0].vocab.morphology.tag_names)}

        # Flatten the gold tags of all documents into one index array that
        # lines up with the rows of ``scores``.
        idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype='i')
        for gold in golds:
            for tag in gold.tags:
                correct[idx] = tag_index[tag]
                idx += 1
        correct = self.model.ops.xp.array(correct)
        # Gradient of the loss with respect to the scores: predictions minus
        # the one-hot encoding of the gold tags.
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        finish_update(d_scores, sgd)
|
|
|
|
|
|
cdef class EntityRecognizer(Parser):
    """Annotate named entities on Doc objects."""
    TransitionSystem = BiluoPushDown

    def add_label(self, label):
        """Register a label with the parser.

        label: The label to add. When it is a string it is also looked up in
            ``self.vocab.strings``.
        """
        Parser.add_label(self, label)
        if not isinstance(label, basestring):
            return
        # The looked-up value is not used any further in this method; the
        # lookup is kept because it presumably registers the string in the
        # vocabulary's string store -- TODO confirm.
        label_id = self.vocab.strings[label]
|
|
|
|
#
|
|
#cdef class BeamEntityRecognizer(BeamParser):
|
|
# """
|
|
# Annotate named entities on Doc objects.
|
|
# """
|
|
# TransitionSystem = BiluoPushDown
|
|
#
|
|
# feature_templates = get_feature_templates('ner')
|
|
#
|
|
# def add_label(self, label):
|
|
# Parser.add_label(self, label)
|
|
# if isinstance(label, basestring):
|
|
# label = self.vocab.strings[label]
|
|
# # Set label into serializer. Super hacky :(
|
|
# for attr, freqs in self.vocab.serializer_freqs:
|
|
# if attr == ENT_TYPE and label not in freqs:
|
|
# freqs.append([label, 1])
|
|
# self.vocab._serializer = None
|
|
#
|
|
|
|
cdef class DependencyParser(Parser):
    """Parser subclass that uses the ArcEager transition system."""
    TransitionSystem = ArcEager

    def add_label(self, label):
        """Register a label with the parser.

        label: The label to add. When it is a string it is also looked up in
            ``self.vocab.strings``.
        """
        Parser.add_label(self, label)
        if not isinstance(label, basestring):
            return
        # The looked-up value is not used any further in this method; the
        # lookup is kept because it presumably registers the string in the
        # vocabulary's string store -- TODO confirm.
        label_id = self.vocab.strings[label]
|
|
|
|
#
|
|
#cdef class BeamDependencyParser(BeamParser):
|
|
# TransitionSystem = ArcEager
|
|
#
|
|
# feature_templates = get_feature_templates('basic')
|
|
#
|
|
# def add_label(self, label):
|
|
# Parser.add_label(self, label)
|
|
# if isinstance(label, basestring):
|
|
# label = self.vocab.strings[label]
|
|
# for attr, freqs in self.vocab.serializer_freqs:
|
|
# if attr == DEP and label not in freqs:
|
|
# freqs.append([label, 1])
|
|
# # Super hacky :(
|
|
# self.vocab._serializer = None
|
|
#
|
|
|
|
#__all__ = [Tagger, DependencyParser, EntityRecognizer, BeamDependencyParser, BeamEntityRecognizer]
|
|
# ``__all__`` must list attribute *names*: with the class objects themselves,
# ``from <module> import *`` raises a TypeError. ``__name__`` is used instead
# of string literals because this module enables ``unicode_literals`` and
# Python 2 rejects unicode entries in ``__all__``.
__all__ = [Tagger.__name__, DependencyParser.__name__, EntityRecognizer.__name__]
|