Revert "Revert "WIP on improving parser efficiency""

This reverts commit 532afef4a8.
Matthew Honnibal 2017-05-23 03:06:53 -05:00
parent 532afef4a8
commit 3959d778ac
9 changed files with 119 additions and 94 deletions

View File

@@ -9,6 +9,7 @@ from pathlib import Path
 import dill
 import tqdm
 from thinc.neural.optimizers import linear_decay
+from timeit import default_timer as timer
 from ..tokens.doc import Doc
 from ..scorer import Scorer
@@ -81,8 +82,13 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
                 batch_size = min(batch_size, max_batch_size)
                 dropout = linear_decay(orig_dropout, dropout_decay, i*n_train_docs+idx)
             with nlp.use_params(optimizer.averages):
+                start = timer()
                 scorer = nlp.evaluate(corpus.dev_docs(nlp))
-                print_progress(i, {}, scorer.scores)
+                end = timer()
+                n_words = scorer.tokens.tp + scorer.tokens.fn
+                assert n_words != 0
+                wps = n_words / (end-start)
+                print_progress(i, {}, scorer.scores, wps=wps)
     with (output_path / 'model.bin').open('wb') as file_:
         with nlp.use_params(optimizer.averages):
             dill.dump(nlp, file_, -1)
@@ -98,14 +104,14 @@ def _render_parses(i, to_render):
         file_.write(html)

-def print_progress(itn, losses, dev_scores):
-    # TODO: Fix!
+def print_progress(itn, losses, dev_scores, wps=0.0):
     scores = {}
     for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc',
-                'ents_p', 'ents_r', 'ents_f']:
+                'ents_p', 'ents_r', 'ents_f', 'wps']:
         scores[col] = 0.0
     scores.update(losses)
     scores.update(dev_scores)
+    scores[wps] = wps
     tpl = '\t'.join((
         '{:d}',
         '{dep_loss:.3f}',
@@ -115,7 +121,8 @@ def print_progress(itn, losses, dev_scores):
         '{ents_r:.3f}',
         '{ents_f:.3f}',
         '{tags_acc:.3f}',
-        '{token_acc:.3f}'))
+        '{token_acc:.3f}',
+        '{wps:.1f}'))
     print(tpl.format(itn, **scores))
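
The new words-per-second readout times the evaluation pass with timeit.default_timer and derives the token count from the scorer's token true positives plus false negatives. A minimal sketch of the same timing pattern, with a stubbed evaluate function standing in for nlp.evaluate (the stub and its return shape are illustrative, not spaCy's API):

    from timeit import default_timer as timer

    def evaluate_with_wps(evaluate, dev_docs):
        # Time the evaluation pass and derive words-per-second from the
        # number of tokens the scorer saw.
        start = timer()
        scores, n_words = evaluate(dev_docs)  # stub: returns (scores, n tokens)
        end = timer()
        assert n_words != 0
        return scores, n_words / (end - start)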

View File

@@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words):
 class GoldCorpus(object):
     """An annotated corpus, using the JSON file format. Manages
     annotations for tagging, dependency parsing and NER."""
-    def __init__(self, train_path, dev_path, limit=None):
+    def __init__(self, train_path, dev_path, gold_preproc=True, limit=None):
         """Create a GoldCorpus.

         train_path (unicode or Path): File or directory of training data.
@@ -184,7 +184,7 @@ class GoldCorpus(object):
             n += 1
         return n

-    def train_docs(self, nlp, shuffle=0, gold_preproc=True,
+    def train_docs(self, nlp, shuffle=0, gold_preproc=False,
                    projectivize=False):
         train_tuples = self.train_tuples
         if projectivize:
@@ -195,7 +195,7 @@ class GoldCorpus(object):
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
         yield from gold_docs

-    def dev_docs(self, nlp, gold_preproc=True):
+    def dev_docs(self, nlp, gold_preproc=False):
         gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
         gold_docs = nlp.preprocess_gold(gold_docs)
         yield from gold_docs
@@ -203,6 +203,11 @@ class GoldCorpus(object):
     @classmethod
     def iter_gold_docs(cls, nlp, tuples, gold_preproc):
         for raw_text, paragraph_tuples in tuples:
+            if gold_preproc:
+                raw_text = None
+            else:
+                paragraph_tuples = merge_sents(paragraph_tuples)
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
                                   gold_preproc)
             golds = cls._make_golds(docs, paragraph_tuples)
@@ -211,15 +216,11 @@ class GoldCorpus(object):
     @classmethod
     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
-        if gold_preproc:
-            return [Doc(nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
-        elif raw_text is not None:
+        if raw_text is not None:
             return [nlp.make_doc(raw_text)]
         else:
-            docs = [Doc(nlp.vocab, words=sent_tuples[0][1])
-                    for sent_tuples in paragraph_tuples]
-            return merge_sents(docs)
+            return [Doc(nlp.vocab, words=sent_tuples[0][1])
+                    for sent_tuples in paragraph_tuples]

     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
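
Taken together, these hunks move the gold_preproc decision up into iter_gold_docs: with gold preprocessing enabled, the raw text is discarded and one Doc per sentence is built from the gold tokenization; without it, the per-sentence annotations are merged and the raw text is re-tokenized. A condensed sketch of that control flow, using merge_sents and the Doc constructor as in the diff (assume the usual spacy.tokens.Doc import):

    def make_docs(nlp, raw_text, paragraph_tuples, gold_preproc=False):
        if gold_preproc:
            raw_text = None  # trust the gold tokenization
        else:
            paragraph_tuples = merge_sents(paragraph_tuples)
        if raw_text is not None:
            return [nlp.make_doc(raw_text)]
        # One Doc per sentence, built directly from the gold words.
        return [Doc(nlp.vocab, words=sent_tuples[0][1])
                for sent_tuples in paragraph_tuples]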

View File

@@ -334,7 +334,7 @@ class Language(object):
         >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
         >>>     assert doc.is_parsed
         """
-        #docs = (self.make_doc(text) for text in texts)
+        docs = (self.make_doc(text) for text in texts)
         docs = texts
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
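
Note that the restored generator expression is immediately shadowed by docs = texts, so in this WIP state pipe() still consumes whatever the caller passes in. A hedged usage sketch under that reading (callers tokenize up front; this is an assumption about the WIP behaviour, not the documented API):

    texts = ['Hello world.', 'This is spaCy.']
    # Under the WIP behaviour, pipe() appears to expect pre-made Doc
    # objects rather than raw strings, so create them explicitly:
    docs = (nlp.make_doc(text) for text in texts)
    for doc in nlp.pipe(docs, batch_size=50, n_threads=4):
        assert doc.is_parsed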

View File

@@ -215,7 +215,7 @@ cdef class Matcher:
         """
         return len(self._patterns)

-    def add(self, key, on_match, *patterns):
+    def add(self, key, *patterns, **kwargs):
         """Add a match-rule to the matcher.

         A match-rule consists of: an ID key, an on_match callback, and one or
         more patterns. If the key exists, the patterns are appended to the
@@ -227,6 +227,7 @@ cdef class Matcher:
         descriptors can also include quantifiers. There are currently important
         known problems with the quantifiers see the docs.
         """
+        on_match = kwargs.get('on_match', None)
         for pattern in patterns:
             if len(pattern) == 0:
                 msg = ("Cannot add pattern for zero tokens to matcher.\n"
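
The signature change makes the patterns positional and moves the callback into keyword arguments; the updated regression test later in the diff exercises the same call. A short usage sketch against the new signature (assuming an nlp object whose vocab the matcher shares; the callback body is illustrative):

    from spacy.matcher import Matcher

    def on_match(matcher, doc, i, matches):
        # Callback receives the matcher, the doc, the index of the
        # current match, and the full list of (match_id, start, end).
        match_id, start, end = matches[i]
        print('Matched:', doc[start:end].text)

    matcher = Matcher(nlp.vocab)
    # New-style call: key first, then one or more patterns, callback as kwarg.
    matcher.add('TEST', [{'ORTH': 'a'}], on_match=on_match)
    matches = matcher(nlp.make_doc('a b c'))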

View File

@@ -167,7 +167,7 @@ class NeuralTagger(object):
         self.model = model

     def __call__(self, doc):
-        tags = self.predict(doc.tensor)
+        tags = self.predict([doc.tensor])
         self.set_annotations([doc], tags)

     def pipe(self, stream, batch_size=128, n_threads=-1):
@@ -340,24 +340,6 @@ cdef class NeuralEntityRecognizer(NeuralParser):
     nr_feature = 6

-    def get_token_ids(self, states):
-        cdef StateClass state
-        cdef int n_tokens = 6
-        ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c')
-        for i, state in enumerate(states):
-            ids[i, 0] = state.c.B(0)-1
-            ids[i, 1] = state.c.B(0)
-            ids[i, 2] = state.c.B(1)
-            ids[i, 3] = state.c.E(0)
-            ids[i, 4] = state.c.E(0)-1
-            ids[i, 5] = state.c.E(0)+1
-            for j in range(6):
-                if ids[i, j] >= state.c.length:
-                    ids[i, j] = -1
-                if ids[i, j] >= 0:
-                    ids[i, j] += state.c.offset
-        return ids

 cdef class BeamDependencyParser(BeamParser):
     TransitionSystem = ArcEager
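
The deleted override selected six context tokens around the buffer head (B(0)-1, B(0), B(1)) and the current entity (E(0)-1, E(0), E(0)+1), clamping out-of-range positions to the padding index -1 and shifting valid ones by the sentence's offset into the batch; with it gone, the entity recognizer presumably falls back to the base Parser's get_token_ids. For reference, the clamping rule in isolation (plain numpy, outside any spaCy API):

    import numpy

    def clamp_token_ids(ids, length, offset):
        # Positions past the end of the sentence become padding (-1);
        # remaining valid positions are shifted into batch coordinates.
        ids = numpy.asarray(ids, dtype='i')
        ids[ids >= length] = -1
        ids[ids >= 0] += offset
        return ids

    print(clamp_token_ids([3, 4, 5, 6], length=5, offset=10))  # [13 14 -1 -1]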

View File

@@ -15,7 +15,7 @@ cdef class Parser:
     cdef readonly object cfg

     cdef void _parse_step(self, StateC* state,
-            const float* feat_weights,
-            int nr_class, int nr_feat) nogil
+            int* token_ids, float* scores, int* is_valid,
+            const float* feat_weights, int nr_class, int nr_feat) nogil

     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

View File

@@ -19,6 +19,7 @@ import numpy.random
 cimport numpy as np
 from libcpp.vector cimport vector
+from libcpp.pair cimport pair
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
@@ -68,6 +69,9 @@ def set_debug(val):
     DEBUG = val

+
+ctypedef pair[int, StateC*] step_t
+

 cdef class precompute_hiddens:
     '''Allow a model to be "primed" by pre-computing input features in bulk.
@@ -119,6 +123,9 @@ cdef class precompute_hiddens:
             self._is_synchronized = True
         return <float*>self._cached.data

+    def get_bp_hiddens(self):
+        return self._bp_hiddens
+
     def __call__(self, X):
         return self.begin_update(X)[0]
@@ -308,7 +315,6 @@ cdef class Parser:
         cdef:
             precompute_hiddens state2vec
             StateClass state
-            Pool mem
             const float* feat_weights
             StateC* st
             vector[StateC*] next_step, this_step
@@ -336,7 +342,14 @@ cdef class Parser:
         cdef int i
         while not next_step.empty():
             for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True):
-                self._parse_step(next_step[i], feat_weights, nr_class, nr_feat)
+                token_ids = <int*>calloc(nr_feat, sizeof(int))
+                scores = <float*>calloc(nr_class, sizeof(float))
+                is_valid = <int*>calloc(nr_class, sizeof(int))
+                self._parse_step(next_step[i], token_ids, scores, is_valid,
+                    feat_weights, nr_class, nr_feat)
+                free(is_valid)
+                free(scores)
+                free(token_ids)
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
@@ -345,12 +358,8 @@ cdef class Parser:
         return states

     cdef void _parse_step(self, StateC* state,
-            const float* feat_weights,
-            int nr_class, int nr_feat) nogil:
-        token_ids = <int*>calloc(nr_feat, sizeof(int))
-        scores = <float*>calloc(nr_class, sizeof(float))
-        is_valid = <int*>calloc(nr_class, sizeof(int))
+            int* token_ids, float* scores, int* is_valid,
+            const float* feat_weights, int nr_class, int nr_feat) nogil:
         state.set_context_tokens(token_ids, nr_feat)
         sum_state_features(scores,
             feat_weights, token_ids, 1, nr_feat, nr_class)
@@ -359,66 +368,90 @@ cdef class Parser:
         action = self.moves.c[guess]
         action.do(state, action.label)
-        free(is_valid)
-        free(scores)
-        free(token_ids)

     def update(self, docs_tokvecs, golds, drop=0., sgd=None):
+        cdef:
+            precompute_hiddens state2vec
+            StateClass state
+            const float* feat_weights
+            StateC* st
+            vector[step_t] next_step, this_step
+            int[:, ::1] is_valid, token_ids
+            float[:, ::1] scores, d_scores, costs
+            int nr_state, nr_feat, nr_class
         docs, tokvec_lists = docs_tokvecs
-        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
+        assert len(docs) == len(golds) == len(tokvec_lists)
+        nr_state = len(docs)
+        nr_feat = self.nr_feature
+        nr_class = self.moves.n_moves
+        token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
+        is_valid = numpy.zeros((nr_state, nr_class), dtype='i')
+        scores = numpy.zeros((nr_state, nr_class), dtype='f')
+        d_scores = numpy.zeros((nr_state, nr_class), dtype='f')
+        costs = numpy.zeros((nr_state, nr_class), dtype='f')
+        tokvecs = self.model[0].ops.flatten(tokvec_lists)
         cuda_stream = get_cuda_stream()
+        state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
+                                                     cuda_stream, drop)
         golds = [self.moves.preprocess_gold(g) for g in golds]
         states = self.moves.init_batch(docs)
-        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
-                                                     drop)
-        todo = [(s, g) for (s, g) in zip(states, golds)
-                if not s.is_final() and g is not None]
+        cdef step_t step
+        cdef int i
+        for i, state in enumerate(states):
+            if not state.c.is_final():
+                step.first = i
+                step.second = state.c
+                next_step.push_back(step)
+                self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, golds[i])
+        feat_weights = state2vec.get_feat_weights()
+        bp_hiddens = state2vec.get_bp_hiddens()
+        d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         backprops = []
-        cdef float loss = 0.
-        while len(todo) >= 3:
-            states, golds = zip(*todo)
-            token_ids = self.get_token_ids(states)
-            vector, bp_vector = state2vec.begin_update(token_ids, drop=drop)
-            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
-            d_scores = self.get_batch_loss(states, golds, scores)
-            d_vector = bp_scores(d_scores, sgd=sgd)
-            if isinstance(self.model[0].ops, CupyOps) \
-            and not isinstance(token_ids, state2vec.ops.xp.ndarray):
-                # Move token_ids and d_vector to CPU, asynchronously
-                backprops.append((
-                    get_async(cuda_stream, token_ids),
-                    get_async(cuda_stream, d_vector),
-                    bp_vector
-                ))
-            else:
-                backprops.append((token_ids, d_vector, bp_vector))
-            self.transition_batch(states, scores)
-            todo = [st for st in todo if not st[0].is_final()]
-        # Tells CUDA to block, so our async copies complete.
-        if cuda_stream is not None:
-            cuda_stream.synchronize()
-        d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
-        xp = state2vec.ops.xp # Handle for numpy/cupy
-        for token_ids, d_vector, bp_vector in backprops:
-            d_state_features = bp_vector(d_vector, sgd=sgd)
-            active_feats = token_ids * (token_ids >= 0)
-            active_feats = active_feats.reshape((token_ids.shape[0], token_ids.shape[1], 1))
+        while next_step.size():
+            # Allocate these each step, so copy an be async
+            np_token_ids = numpy.zeros((nr_state, nr_feat), dtype='i')
+            np_d_scores = numpy.zeros((nr_state, nr_class), dtype='f')
+            token_ids = np_token_ids
+            d_scores = np_d_scores
+            for step in next_step:
+                i = step.first
+                st = step.second
+                self._parse_step(st, &token_ids[i, 0],
+                    &scores[i, 0], &is_valid[i, 0],
+                    feat_weights, nr_class, nr_feat)
+                cpu_log_loss(&d_scores[i, 0],
+                    &costs[i, 0], &is_valid[i, 0], &scores[i, 0], nr_class)
+            backprops.append((
+                get_async(cuda_stream, np_token_ids),
+                get_async(cuda_stream, np_d_scores)))
+            this_step, next_step = next_step, this_step
+            next_step.clear()
+            for step in this_step:
+                i = step.first
+                st = step.second
+                if not st.is_final():
+                    next_step.push_back(step)
+                    self.moves.set_costs(&is_valid[i, 0], &costs[i, 0],
+                        states[i], golds[i])
+        cuda_stream.synchronize()
+        for gpu_token_ids, gpu_d_scores in backprops:
+            d_features = bp_hiddens((gpu_d_scores, gpu_token_ids), sgd)
+            d_features *= (gpu_token_ids >= 0).reshape((nr_state, nr_feat, 1))
+            xp = self.model[0].ops.xp
             if hasattr(xp, 'scatter_add'):
-                xp.scatter_add(d_tokvecs,
-                    token_ids, d_state_features * active_feats)
+                xp.scatter_add(d_tokvecs, gpu_token_ids, d_features)
             else:
-                xp.add.at(d_tokvecs,
-                    token_ids, d_state_features * active_feats)
+                xp.add.at(d_tokvecs, gpu_token_ids, d_features)
         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])

     def get_batch_model(self, batch_size, tokvecs, stream, dropout):
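
Two threads run through this rewrite. First, _parse_step no longer calloc's its own scratch buffers: the caller owns token_ids, scores and is_valid, so the training loop can point the nogil step function at rows of preallocated arrays, hand those arrays to get_async for asynchronous host-to-device copies, and fence all the copies with a single cuda_stream.synchronize() before the backward pass. Second, the token-vector gradient is accumulated with a scatter-add: CuPy exposes scatter_add, numpy's equivalent is the unbuffered add.at, and masking features at the padding index -1 zeroes their contribution. A self-contained numpy sketch of that masked accumulation (shapes are illustrative):

    import numpy

    nr_state, nr_feat, width = 2, 3, 4
    d_tokvecs = numpy.zeros((10, width), dtype='f')
    token_ids = numpy.array([[0, 1, -1], [2, 2, 9]], dtype='i')
    d_features = numpy.ones((nr_state, nr_feat, width), dtype='f')

    # Zero out gradients for padding features so the -1 rows add nothing.
    d_features *= (token_ids >= 0).reshape((nr_state, nr_feat, 1))

    xp = numpy  # with CuPy, hasattr(xp, 'scatter_add') would be True
    if hasattr(xp, 'scatter_add'):
        xp.scatter_add(d_tokvecs, token_ids, d_features)
    else:
        # add.at is unbuffered, so repeated ids (the two 2s) accumulate.
        xp.add.at(d_tokvecs, token_ids, d_features)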

View File

@@ -17,8 +17,9 @@ def test_issue429(EN):
     doc = EN('a')
     matcher = Matcher(EN.vocab)
-    matcher.add('TEST', on_match=merge_phrases, [{'ORTH': 'a'}])
-    doc = EN.tokenizer('a b c')
+    matcher.add('TEST', [{'ORTH': 'a'}], on_match=merge_phrases)
+    doc = EN.make_doc('a b c')
     EN.tagger(doc)
     matcher(doc)
     EN.entity(doc)

View File

@@ -1,8 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from ...matcher import Matcher, PhraseMatcher
-from ..util import get_doc
+from ..matcher import Matcher, PhraseMatcher
+from .util import get_doc

 import pytest