mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-01 11:43:03 +03:00
Bug fixes to pipeline
This commit is contained in:
parent
8815507f8e
commit
b460533827
|
@ -8,6 +8,7 @@ from thinc.neural import Model, Softmax
|
||||||
import numpy
|
import numpy
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
import cytoolz
|
import cytoolz
|
||||||
|
import util
|
||||||
|
|
||||||
from thinc.api import add, layerize, chain, clone, concatenate
|
from thinc.api import add, layerize, chain, clone, concatenate
|
||||||
from thinc.neural import Model, Maxout, Softmax, Affine
|
from thinc.neural import Model, Maxout, Softmax, Affine
|
||||||
|
@ -42,12 +43,14 @@ class TokenVectorEncoder(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, width=128, embed_size=5000, **cfg):
|
def Model(cls, width=128, embed_size=5000, **cfg):
|
||||||
|
width = util.env_opt('token_vector_width', width)
|
||||||
|
embed_size = util.env_opt('embed_size', embed_size)
|
||||||
return Tok2Vec(width, embed_size, preprocess=None)
|
return Tok2Vec(width, embed_size, preprocess=None)
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self.doc2feats = doc2feats()
|
self.doc2feats = doc2feats()
|
||||||
self.model = self.Model() if model is True else model
|
self.model = model
|
||||||
|
|
||||||
def __call__(self, docs, state=None):
|
def __call__(self, docs, state=None):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
|
@ -88,6 +91,11 @@ class TokenVectorEncoder(object):
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def begin_training(self, gold_tuples, pipeline=None):
|
||||||
|
self.doc2feats = doc2feats()
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model()
|
||||||
|
|
||||||
|
|
||||||
class NeuralTagger(object):
|
class NeuralTagger(object):
|
||||||
name = 'nn_tagger'
|
name = 'nn_tagger'
|
||||||
|
@ -117,15 +125,17 @@ class NeuralTagger(object):
|
||||||
guesses = guesses.get()
|
guesses = guesses.get()
|
||||||
return guesses
|
return guesses
|
||||||
|
|
||||||
def set_annotations(self, docs, tag_ids):
|
def set_annotations(self, docs, batch_tag_ids):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
|
cdef int i, j
|
||||||
|
cdef Vocab vocab = self.vocab
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
tag_ids = tag_ids[idx:idx+len(doc)]
|
doc_tag_ids = batch_tag_ids[idx:idx+len(doc)]
|
||||||
for j, tag_id in enumerate(tag_ids):
|
for j, tag_id in enumerate(doc_tag_ids):
|
||||||
doc.vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
|
||||||
idx += 1
|
idx += 1
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||||
|
@ -139,25 +149,19 @@ class NeuralTagger(object):
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
|
||||||
|
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
||||||
d_tokvecs = bp_tag_scores(d_tag_scores, sgd)
|
|
||||||
|
d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
|
||||||
bp_tokvecs(d_tokvecs, sgd=sgd)
|
bp_tokvecs(d_tokvecs, sgd=sgd)
|
||||||
|
|
||||||
state['tag_scores'] = tag_scores
|
state['tag_scores'] = tag_scores
|
||||||
state['bp_tag_scores'] = bp_tag_scores
|
|
||||||
state['d_tag_scores'] = d_tag_scores
|
|
||||||
state['tag_loss'] = loss
|
state['tag_loss'] = loss
|
||||||
|
|
||||||
if 'd_tokvecs' in state:
|
|
||||||
state['d_tokvecs'] += d_tokvecs
|
|
||||||
else:
|
|
||||||
state['d_tokvecs'] = d_tokvecs
|
|
||||||
return state
|
return state
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
tag_index = {tag: i for i, tag in enumerate(docs[0].vocab.morphology.tag_names)}
|
tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)}
|
||||||
|
|
||||||
idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for tag in gold.tags:
|
for tag in gold.tags:
|
||||||
|
@ -165,10 +169,11 @@ class NeuralTagger(object):
|
||||||
idx += 1
|
idx += 1
|
||||||
correct = self.model.ops.xp.array(correct)
|
correct = self.model.ops.xp.array(correct)
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
return (d_scores**2).sum(), d_scores
|
loss = (d_scores**2).sum()
|
||||||
|
d_scores = self.model.ops.asarray(d_scores)
|
||||||
|
return loss, d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples, pipeline=None):
|
def begin_training(self, gold_tuples, pipeline=None):
|
||||||
# Populate tag map, if anything's missing.
|
|
||||||
tag_map = dict(self.vocab.morphology.tag_map)
|
tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
for raw_text, annots_brackets in gold_tuples:
|
||||||
for annots, brackets in annots_brackets:
|
for annots, brackets in annots_brackets:
|
||||||
|
@ -176,14 +181,12 @@ class NeuralTagger(object):
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
if tag not in tag_map:
|
if tag not in tag_map:
|
||||||
tag_map[tag] = {POS: X}
|
tag_map[tag] = {POS: X}
|
||||||
|
|
||||||
cdef Vocab vocab = self.vocab
|
cdef Vocab vocab = self.vocab
|
||||||
vocab.morphology = Morphology(self.vocab.strings, tag_map,
|
vocab.morphology = Morphology(vocab.strings, tag_map,
|
||||||
self.vocab.morphology.lemmatizer)
|
vocab.morphology.lemmatizer)
|
||||||
self.model = Softmax(self.vocab.morphology.n_tags)
|
self.model = Softmax(self.vocab.morphology.n_tags)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(LinearParser):
|
cdef class EntityRecognizer(LinearParser):
|
||||||
"""
|
"""
|
||||||
Annotate named entities on Doc objects.
|
Annotate named entities on Doc objects.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user