Tidy up syntax

ines 2017-10-27 19:45:57 +02:00
parent 5167a0cce2
commit b4d226a3f1
8 changed files with 195 additions and 230 deletions

View File

@@ -2,7 +2,7 @@
 # cython: profile=True
 cimport numpy as np
 import numpy
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport PyObject, Py_XDECREF
 from thinc.extra.search cimport Beam
 from thinc.extra.search import MaxViolation
 from thinc.typedefs cimport hash_t, class_t
@@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
 from .transition_system cimport TransitionSystem, Transition
 from .stateclass cimport StateClass
 from ..gold cimport GoldParse
-from ..tokens.doc cimport Doc


 # These are passed as callbacks to thinc.search.Beam
@@ -50,7 +49,7 @@ cdef class ParserBeam(object):
     cdef public object dones

     def __init__(self, TransitionSystem moves, states, golds,
-                  int width, float density):
+                 int width, float density):
        self.moves = moves
        self.states = states
        self.golds = golds
@@ -59,7 +58,8 @@ cdef class ParserBeam(object):
        cdef StateClass state, st
        for state in states:
            beam = Beam(self.moves.n_moves, width, density)
-            beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
+            beam.initialize(self.moves.init_beam_state, state.c.length,
+                            state.c._sent)
            for i in range(beam.width):
                st = <StateClass>beam.at(i)
                st.c.offset = state.c.offset
@@ -74,7 +74,8 @@ cdef class ParserBeam(object):
    @property
    def is_done(self):
-        return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
+        return all(b.is_done or self.dones[i]
+                   for i, b in enumerate(self.beams))

    def __getitem__(self, i):
        return self.beams[i]
@@ -126,7 +127,8 @@ cdef class ParserBeam(object):
            for i in range(beam.size):
                state = <StateClass>beam.at(i)
                if not state.c.is_final():
-                    self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
+                    self.moves.set_costs(beam.is_valid[i], beam.costs[i],
+                                         state, gold)
                    if follow_gold:
                        for j in range(beam.nr_class):
                            if beam.costs[i][j] >= 1:
@@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
        c_ids += ids.shape[1]
    return ids

+
 nr_update = 0
+
+
 def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
                states, golds,
                state2vec, vec2scores,
@@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
        if pbeam.is_done and gbeam.is_done:
            break
        # The beam maps let us find the right row in the flattened scores
-        # arrays for each state. States are identified by (example id, history).
-        # We keep a different beam map for each step (since we'll have a flat
-        # scores array for each step). The beam map will let us take the per-state
-        # losses, and compute the gradient for each (step, state, class).
+        # arrays for each state. States are identified by (example id,
+        # history). We keep a different beam map for each step (since we'll
+        # have a flat scores array for each step). The beam map will let us
+        # take the per-state losses, and compute the gradient for each (step,
+        # state, class).
        beam_maps.append({})
        # Gather all states from the two beams in a list. Some stats may occur
        # in both beams. To figure out which beam each state belonged to,
        # we keep two lists of indices, p_indices and g_indices
-        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
+        states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
+                                                  nr_update)
        if not states:
            break
        # Now that we have our flat list of states, feed them through the model
        token_ids = get_token_ids(states, nr_feature)
        vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
        if hist_feats:
-            hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
-            scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
+            hists = numpy.asarray([st.history[:hist_feats] for st in states],
+                                  dtype='i')
+            scores, bp_scores = vec2scores.begin_update((vectors, hists),
+                                                        drop=drop)
        else:
            scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
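The comment in this hunk is the heart of the beam update: each step keeps a beam map from (example id, action history) to a row of that step's flattened scores array, so per-state losses can later be routed to a (step, state, class) gradient. A minimal sketch of that bookkeeping, with hypothetical names (the real logic lives in get_states and get_gradient, not in this exact form):

    # Sketch of the beam-map bookkeeping described above. Hypothetical
    # names, not spaCy's actual code.
    beam_map = {}   # (eg_id, history) -> row in this step's flat scores
    states = []     # deduplicated states gathered from both beams

    def row_for(eg_id, history, state):
        key = (eg_id, tuple(history))
        if key not in beam_map:
            beam_map[key] = len(states)
            states.append(state)
        return beam_map[key]

    # A loss for a state at this step is later applied to its row:
    # grads[step][beam_map[(eg_id, tuple(history))], action] += loss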
@@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
        # Unpack the flat scores into lists for the two beams. The indices arrays
        # tell us which example and state the scores-row refers to.
-        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
-        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
+        p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
+                    for indices in p_indices]
+        g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
+                    for indices in g_indices]
        # Now advance the states in the beams. The gold beam is contrained to
        # to follow only gold analyses.
        pbeam.advance(p_scores)
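The p_scores/g_scores rewrap above relies on numpy fancy indexing: each beam's rows are pulled out of one flat (n_states, n_classes) array. A self-contained toy example of the same indexing:

    import numpy

    # Toy version of the unpacking above: p_indices holds, per example,
    # the rows of the flat scores array that belong to the parser beam.
    scores = numpy.arange(12, dtype='f').reshape(4, 3)  # 4 states, 3 classes
    p_indices = [numpy.array([0, 2])]                   # beam 0's rows
    p_scores = [numpy.ascontiguousarray(scores[idx], dtype='f')
                for idx in p_indices]
    print(p_scores[0].shape)  # (2, 3)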
@@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):

 def get_gradient(nr_class, beam_maps, histories, losses):
-    """
-    The global model assigns a loss to each parse. The beam scores
+    """The global model assigns a loss to each parse. The beam scores
    are additive, so the same gradient is applied to each action
    in the history. This gives the gradient of a single *action*
    for a beam state -- so we have "the gradient of loss for taking
@@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
            if loss != 0.0 and not numpy.isnan(loss):
                nr_step = max(nr_step, len(hist))
    for i in range(nr_step):
-        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
+        grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
+                                 dtype='f'))
    assert len(histories) == len(losses)
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
@@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
                grads[j][i, clas] += loss
                key = key + tuple([clas])
    return grads
-
-
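The get_gradient docstring explains that beam scores are additive, so every action in a candidate parse's history receives the parse's whole loss. A toy rendering of that accumulation, with invented shapes and numbers:

    import numpy

    # Each (step, row, class) cell touched by the history gets the full
    # loss of the parse. Values here are invented for illustration.
    nr_class = 4
    history = [1, 3, 0]   # actions taken by one candidate parse
    loss = 0.5            # loss assigned to that parse
    rows = [0, 2, 1]      # row in each step's flat scores array
    grads = [numpy.zeros((3, nr_class), dtype='f') for _ in history]
    for step, (row, clas) in enumerate(zip(rows, history)):
        grads[step][row, clas] += loss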

View File

@@ -1 +0,0 @@
-# test

View File

@@ -4,24 +4,16 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-import ctypes
-from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
+from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from collections import OrderedDict
 from thinc.extra.search cimport Beam
-import numpy

 from .stateclass cimport StateClass
-from ._state cimport StateC, is_space_token
+from ._state cimport StateC
 from .nonproj import is_nonproj_tree
-from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
-from ..gold cimport GoldParse
-from ..gold cimport GoldParseC
-from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
-from ..lexeme cimport Lexeme
+from ..gold cimport GoldParse, GoldParseC
 from ..structs cimport TokenC
@@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
    @classmethod
    def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions',
-            OrderedDict((
-                (SHIFT, ['']),
-                (REDUCE, ['']),
-                (RIGHT, []),
-                (LEFT, []),
-                (BREAK, ['ROOT'])
-            )))
+        actions = kwargs.get('actions', OrderedDict((
+            (SHIFT, ['']),
+            (REDUCE, ['']),
+            (RIGHT, []),
+            (LEFT, []),
+            (BREAK, ['ROOT']))
+        ))
        seen_actions = set()
        for label in kwargs.get('left_labels', []):
            if label.upper() != 'ROOT':
@@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
            if gold.cand_to_gold[i] is None:
                continue
            if state.safe_get(i).dep:
-                predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
+                predicted.add((i, state.H(i),
+                               self.strings[state.safe_get(i).dep]))
            else:
                predicted.add((i, state.H(i), 'ROOT'))
            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
        if not self.has_gold(gold):
            return None
        for i in range(gold.length):
-            if gold.heads[i] is None or gold.labels[i] is None: # Missing values
+            # Missing values
+            if gold.heads[i] is None or gold.labels[i] is None:
                gold.c.heads[i] = i
                gold.c.has_dep[i] = False
            else:
@@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
            # Check projectivity --- leading cause
            if is_nonproj_tree(gold.heads):
                raise ValueError(
-                    "Could not find a gold-standard action to supervise the dependency "
-                    "parser.\n"
-                    "Likely cause: the tree is non-projective (i.e. it has crossing "
-                    "arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
-                    "The ArcEager transition system only supports projective trees.\n"
-                    "To learn non-projective representations, transform the data "
-                    "before training and after parsing. Either pass make_projective=True "
-                    "to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
+                    "Could not find a gold-standard action to supervise the "
+                    "dependency parser. Likely cause: the tree is "
+                    "non-projective (i.e. it has crossing arcs -- see "
+                    "spacy/syntax/nonproj.pyx for definitions). The ArcEager "
+                    "transition system only supports projective trees. To "
+                    "learn non-projective representations, transform the data "
+                    "before training and after parsing. Either pass "
+                    "make_projective=True to the GoldParse class, or use "
+                    "spacy.syntax.nonproj.preprocess_training_data.")
            else:
                print(gold.orig_annot)
                print(gold.words)
@@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
                print(gold.labels)
                print(gold.sent_starts)
                raise ValueError(
-                    "Could not find a gold-standard action to supervise the dependency "
-                    "parser.\n"
-                    "The GoldParse was projective.\n"
-                    "The transition system has %d actions.\n"
-                    "State at failure:\n"
-                    "%s" % (self.n_moves, stcls.print_state(gold.words)))
+                    "Could not find a gold-standard action to supervise the"
+                    "dependency parser. The GoldParse was projective. The "
+                    "transition system has %d actions. State at failure: %s"
+                    % (self.n_moves, stcls.print_state(gold.words)))
        assert n_gold >= 1

    def get_beam_annot(self, Beam beam):
@@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
                deps[j].setdefault(dep, 0.0)
                deps[j][dep] += prob
        return heads, deps
-
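The get_beam_annot hunk above marginalizes over beam candidates: each candidate parse contributes its probability mass to the head/label counts of every token. A small sketch of that accumulation, with hypothetical numbers:

    # Each beam candidate (prob, {token: (dep, head)}) adds its probability
    # to the per-token dependency-label marginals, as in the hunk above.
    candidates = [
        (0.7, {1: ('nsubj', 0)}),
        (0.3, {1: ('dobj', 2)}),
    ]
    deps = {}
    for prob, parse in candidates:
        for j, (dep, head) in parse.items():
            deps.setdefault(j, {}).setdefault(dep, 0.0)
            deps[j][dep] += prob
    print(deps)  # {1: {'nsubj': 0.7, 'dobj': 0.3}}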

View File

@@ -4,17 +4,12 @@ from __future__ import unicode_literals
 from thinc.typedefs cimport weight_t
 from thinc.extra.search cimport Beam
 from collections import OrderedDict
-import numpy
-from thinc.neural.ops import NumpyOps

 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
-from ..structs cimport TokenC, Entity
-from ..gold cimport GoldParseC
-from ..gold cimport GoldParse
-from ..attrs cimport ENT_TYPE, ENT_IOB
+from ..gold cimport GoldParseC, GoldParse


 cdef enum:
@@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
    @classmethod
    def get_actions(cls, **kwargs):
-        actions = kwargs.get('actions',
-            OrderedDict((
-                (MISSING, ['']),
-                (BEGIN, []),
-                (IN, []),
-                (LAST, []),
-                (UNIT, []),
-                (OUT, [''])
-            )))
+        actions = kwargs.get('actions', OrderedDict((
+            (MISSING, ['']),
+            (BEGIN, []),
+            (IN, []),
+            (LAST, []),
+            (UNIT, []),
+            (OUT, [''])
+        )))
        seen_entities = set()
        for entity_type in kwargs.get('entity_types', []):
            if entity_type in seen_entities:
@@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
    cdef Transition lookup_transition(self, object name) except *:
        cdef attr_t label
-        if name == '-' or name == None:
+        if name == '-' or name is None:
            return Transition(clas=0, move=MISSING, label=0, score=0)
        elif name == '!O':
            return Transition(clas=0, move=ISNT, label=0, score=0)
@@ -328,8 +322,8 @@ cdef class In:
            return False
        elif preset_ent_iob == 3:
            return False
-        # TODO: Is this quite right?
-        # I think it's supposed to be ensuring the gazetteer matches are maintained
+        # TODO: Is this quite right? I think it's supposed to be ensuring the
+        # gazetteer matches are maintained
        elif st.B_(1).ent_iob != preset_ent_iob:
            return False
        # Don't allow entities to extend across sentence boundaries
@@ -354,10 +348,12 @@ cdef class In:
        if g_act == MISSING:
            return 0
        elif g_act == BEGIN:
-            # I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
+            # I, Gold B --> True
+            # (P of bad open entity sunk, R of this entity sunk)
            return 0
        elif g_act == IN:
-            # I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
+            # I, Gold I --> True
+            # (label forced by prev, if mismatch, P and R both sunk)
            return 0
        elif g_act == LAST:
            # I, Gold L --> True iff this entity sunk and next tag == O
@@ -505,11 +501,3 @@ cdef class Out:
            return 1
        else:
            return 1
-
-
-class OracleError(Exception):
-    pass
-
-
-class UnknownMove(Exception):
-    pass

View File

@@ -5,71 +5,48 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function

-from collections import Counter, OrderedDict
+from collections import OrderedDict
 import ujson
 import json
-import contextlib
 import numpy
-from libc.math cimport exp
-cimport cython
 cimport cython.parallel
 import cytoolz
-import dill
 import numpy.random
 cimport numpy as np

-from libcpp.vector cimport vector
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
-from libc.stdint cimport uint32_t, uint64_t
-from libc.string cimport memset, memcpy
-from libc.stdlib cimport malloc, calloc, free
-from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
-from thinc.linear.avgtron cimport AveragedPerceptron
-from thinc.linalg cimport Vec, VecVec
-from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
-from thinc.extra.eg cimport Example
+from libc.math cimport exp
+from libcpp.vector cimport vector
+from libc.string cimport memset
+from libc.stdlib cimport calloc, free
+from cymem.cymem cimport Pool
+from thinc.typedefs cimport weight_t, class_t, hash_t
 from thinc.extra.search cimport Beam
-from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
-from preshed.maps cimport MapStruct
-from preshed.maps cimport map_get
-from thinc.api import layerize, chain, clone, with_flatten
-from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
+from thinc.api import chain, clone
+from thinc.v2v import Model, Maxout, Affine
 from thinc.misc import LayerNorm
-from thinc.neural.ops import NumpyOps, CupyOps
+from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
-from .. import util
-from ..util import get_async, get_cuda_stream
-from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
-from .._ml import Tok2Vec, doc2feats, rebatch
-from .._ml import Residual, flatten
+from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
 from .._ml import link_vectors_to_models
 from ..compat import json_dumps, copy_array
+from ..tokens.doc cimport Doc
+from ..gold cimport GoldParse
+from .. import util
 from .stateclass cimport StateClass
 from ._state cimport StateC
-from . import nonproj
-from .transition_system import OracleError
-from .transition_system cimport TransitionSystem, Transition
-from ..structs cimport TokenC
-from ..tokens.doc cimport Doc
-from ..strings cimport StringStore
-from ..gold cimport GoldParse
-from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
-from . import _beam_utils
+from .transition_system cimport Transition
+from . import _beam_utils, nonproj


 def get_templates(*args, **kwargs):
     return []

 DEBUG = False


 def set_debug(val):
     global DEBUG
     DEBUG = val
@@ -100,7 +77,8 @@ cdef class precompute_hiddens:
    cdef object _cuda_stream
    cdef object _bp_hiddens

-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
+    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
+                 drop=0.):
        gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
@@ -120,8 +98,7 @@ cdef class precompute_hiddens:
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized \
-        and self._cuda_stream is not None:
+        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data
@@ -130,7 +107,8 @@ cdef class precompute_hiddens:
        return self.begin_update(X)[0]

    def begin_update(self, token_ids, drop=0.):
-        cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
+        cdef np.ndarray state_vector = numpy.zeros(
+            (token_ids.shape[0], self.nO*self.nP), dtype='f')
        # This is tricky, but (assuming GPU available);
        # - Input to forward on CPU
        # - Output from forward on CPU
@@ -141,8 +119,8 @@ cdef class precompute_hiddens:
        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(<float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
+                           feat_weights, &ids[0, 0],
+                           token_ids.shape[0], self.nF, self.nO*self.nP)
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector, sgd=None):
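precompute_hiddens and sum_state_features implement the precomputation trick: the lower layer runs once per token, caching a per-(token, feature-slot) contribution, and each state's hidden vector is then just a sum of nF cached rows selected by its feature token ids. A numpy sketch of the idea, with invented shapes:

    import numpy

    # Invented shapes: 10 tokens, nF=3 feature slots, hidden width 8.
    nr_token, nF, nH = 10, 3, 8
    cached = numpy.random.rand(nr_token, nF, nH).astype('f')
    token_ids = numpy.array([4, 0, 7])  # one state's feature tokens
    # Pick row (token_ids[f], f) for each slot f and sum them.
    state_vector = cached[token_ids, numpy.arange(nF)].sum(axis=0)
    print(state_vector.shape)  # (8,)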
@@ -161,10 +139,11 @@ cdef class precompute_hiddens:
        state_vector = state_vector.reshape(
            (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
        best, which = self.ops.maxout(state_vector)

        def backprop(d_best, sgd=None):
            return self.ops.backprop_maxout(d_best, which, self.nP)
+
        return best, backprop


 cdef void sum_state_features(float* output,
@@ -239,11 +218,15 @@ cdef class Parser:
        depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
        if depth != 1:
            raise ValueError("Currently parser depth is hard-coded to 1.")
-        parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
+        parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
+                                            cfg.get('maxout_pieces', 2))
        if parser_maxout_pieces != 2:
-            raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
-        token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
+            raise ValueError("Currently parser_maxout_pieces is hard-coded "
+                             "to 2")
+        token_vector_width = util.env_opt('token_vector_width',
+                                          cfg.get('token_vector_width', 128))
+        hidden_width = util.env_opt('hidden_width',
+                                    cfg.get('hidden_width', 200))
        embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
        hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
        hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -365,8 +348,8 @@ cdef class Parser:
                parse_states = self.parse_batch(subbatch)
                beams = []
            else:
-                beams = self.beam_parse(subbatch,
-                                        beam_width=beam_width, beam_density=beam_density)
+                beams = self.beam_parse(subbatch, beam_width=beam_width,
+                                        beam_density=beam_density)
                parse_states = []
                for beam in beams:
                    parse_states.append(<StateClass>beam.at(0))
@@ -386,9 +369,9 @@ cdef class Parser:
        if isinstance(docs, Doc):
            docs = [docs]

-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, 0.0)
        nr_state = len(docs)
        nr_class = self.moves.n_moves
        nr_dim = tokvecs.shape[1]
@@ -402,7 +385,8 @@ cdef class Parser:
        feat_weights = state2vec.get_feat_weights()
        cdef int i
-        cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
+        cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
+            vec2scores._layers[-1].W.T)
        cdef np.ndarray hidden_bias = vec2scores._layers[-1].b

        hW = <float*>hidden_weights.data
@@ -462,9 +446,9 @@ cdef class Parser:
        cdef Doc doc
        cdef int nr_class = self.moves.n_moves
        cdef StateClass stcls, output
-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
-                                                                            0.0)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, 0.0)
        beams = []
        cdef int offset = 0
        cdef int j = 0
@@ -519,9 +503,7 @@ cdef class Parser:
        if isinstance(docs, Doc) and isinstance(golds, GoldParse):
            docs = [docs]
            golds = [golds]
-
-        cuda_stream = get_cuda_stream()
-
+        cuda_stream = util.get_cuda_stream()
        states, golds, max_steps = self._init_gold_batch(docs, golds)
        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
                                                                            drop)
@@ -536,7 +518,6 @@ cdef class Parser:
        n_steps = 0
        while todo:
            states, golds = zip(*todo)
-
            token_ids = self.get_token_ids(states)
            vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
            if drop != 0:
@@ -558,8 +539,8 @@ cdef class Parser:
                    and not isinstance(token_ids, state2vec.ops.xp.ndarray):
                # Move token_ids and d_vector to GPU, asynchronously
                backprops.append((
-                    get_async(cuda_stream, token_ids),
-                    get_async(cuda_stream, d_vector),
+                    util.get_async(cuda_stream, token_ids),
+                    util.get_async(cuda_stream, d_vector),
                    bp_vector
                ))
            else:
@@ -592,15 +573,13 @@ cdef class Parser:
        states = self.moves.init_batch(docs)
        for gold in golds:
            self.moves.preprocess_gold(gold)
-
-        cuda_stream = get_cuda_stream()
-        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
-
-        states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
-                                                             states, golds,
-                                                             state2vec, vec2scores,
-                                                             width, density, self.cfg.get('hist_size', 0),
-                                                             drop=drop, losses=losses)
+        cuda_stream = util.get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
+            docs, cuda_stream, drop)
+        states_d_scores, backprops = _beam_utils.update_beam(
+            self.moves, self.nr_feature, 500, states, golds, state2vec,
+            vec2scores, width, density, self.cfg.get('hist_size', 0),
+            drop=drop, losses=losses)
        backprop_lower = []
        cdef float batch_size = len(docs)
        for i, d_scores in enumerate(states_d_scores):
@@ -612,13 +591,14 @@ cdef class Parser:
            if isinstance(self.model[0].ops, CupyOps) \
                    and not isinstance(ids, state2vec.ops.xp.ndarray):
                backprop_lower.append((
-                    get_async(cuda_stream, ids),
-                    get_async(cuda_stream, d_vector),
+                    util.get_async(cuda_stream, ids),
+                    util.get_async(cuda_stream, d_vector),
                    bp_vectors))
            else:
                backprop_lower.append((ids, d_vector, bp_vectors))
        d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
-        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
+        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
+                           cuda_stream)

    def _init_gold_batch(self, whole_docs, whole_golds):
        """Make a square batch, of length equal to the shortest doc. A long
@@ -768,7 +748,8 @@ cdef class Parser:
    def begin_training(self, gold_tuples, pipeline=None, **cfg):
        if 'model' in cfg:
            self.model = cfg['model']
-        gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
+        gold_tuples = nonproj.preprocess_training_data(gold_tuples,
+                                                       label_freq_cutoff=100)
        actions = self.moves.get_actions(gold_parses=gold_tuples)
        for action, labels in actions.items():
            for label in labels:

View File

@@ -1,39 +1,37 @@
 # coding: utf-8
-"""
-Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
+"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
 for doing pseudo-projective parsing implementation uses the HEAD decoration
 scheme.
 """
 from __future__ import unicode_literals
 from copy import copy
-from ..tokens.doc cimport Doc
-from ..attrs import DEP, HEAD

 DELIMITER = '||'


 def ancestors(tokenid, heads):
-    # returns all words going from the word up the path to the root
-    # the path to root cannot be longer than the number of words in the sentence
-    # this function ends after at most len(heads) steps
-    # because it would otherwise loop indefinitely on cycles
+    # Returns all words going from the word up the path to the root. The path
+    # to root cannot be longer than the number of words in the sentence. This
+    # function ends after at most len(heads) steps, because it would otherwise
+    # loop indefinitely on cycles.
    head = tokenid
    cnt = 0
    while heads[head] != head and cnt < len(heads):
        head = heads[head]
        cnt += 1
        yield head
-        if head == None:
+        if head is None:
            break


 def contains_cycle(heads):
-    # in an acyclic tree, the path from each word following
-    # the head relation upwards always ends at the root node
+    # in an acyclic tree, the path from each word following the head relation
+    # upwards always ends at the root node
    for tokenid in range(len(heads)):
        seen = set([tokenid])
-        for ancestor in ancestors(tokenid,heads):
+        for ancestor in ancestors(tokenid, heads):
            if ancestor in seen:
                return seen
            seen.add(ancestor)
@@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
    # if there is a token k, h < k < d such that h is not
    # an ancestor of k. Same for h -> d, h > d
    head = heads[tokenid]
-    if head == tokenid: # root arcs cannot be non-projective
+    if head == tokenid:  # root arcs cannot be non-projective
        return False
-    elif head == None: # unattached tokens cannot be non-projective
+    elif head is None:  # unattached tokens cannot be non-projective
        return False
    start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
-    for k in range(start,end):
-        for ancestor in ancestors(k,heads):
-            if ancestor == None: # for unattached tokens/subtrees
+    for k in range(start, end):
+        for ancestor in ancestors(k, heads):
+            if ancestor is None:  # for unattached tokens/subtrees
                break
-            elif ancestor == head: # normal case: k dominated by h
+            elif ancestor == head:  # normal case: k dominated by h
                break
-        else: # head not in ancestors: d -> h is non-projective
+        else:  # head not in ancestors: d -> h is non-projective
            return True
    return False


 def is_nonproj_tree(heads):
    # a tree is non-projective if at least one arc is non-projective
-    return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
+    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))


 def decompose(label):
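A concrete case helps here. In the heads array below, token 3 attaches to token 1 while token 2 attaches to the root at 0, so the arc 3 -> 1 crosses 2 -> 0 (think "saw a dog yesterday which ...", with "which" modifying "dog"):

    # Worked example for is_nonproj_arc/is_nonproj_tree above.
    heads = [0, 0, 0, 1]
    print(is_nonproj_arc(3, heads))  # True: 1 is not an ancestor of token 2
    print(is_nonproj_tree(heads))    # True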
@@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
    for raw_text, sents in gold_tuples:
        prepro_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            proj_heads,deco_labels = projectivize(heads,labels)
+            proj_heads, deco_labels = projectivize(heads, labels)
            # set the label to ROOT for each root dependent
-            deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
+            deco_labels = ['ROOT' if head == i else deco_labels[i]
+                           for i, head in enumerate(proj_heads)]
            # count label frequencies
            if label_freq_cutoff > 0:
                for label in deco_labels:
                    if is_decorated(label):
-                        freqs[label] = freqs.get(label,0) + 1
-            prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
+                        freqs[label] = freqs.get(label, 0) + 1
+            prepro_sents.append(
+                ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
        preprocessed.append((raw_text, prepro_sents))
-
    if label_freq_cutoff > 0:
-        return _filter_labels(preprocessed,label_freq_cutoff,freqs)
+        return _filter_labels(preprocessed, label_freq_cutoff, freqs)
    return preprocessed


 def projectivize(heads, labels):
-    # use the algorithm by Nivre & Nilsson 2005
-    # assumes heads to be a proper tree, i.e. connected and cycle-free
-    # returns a new pair (heads,labels) which encode
-    # a projective and decorated tree
+    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
+    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
+    # which encode a projective and decorated tree.
    proj_heads = copy(heads)
    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
-    if smallest_np_arc == None: # this sentence is already projective
+    if smallest_np_arc is None:  # this sentence is already projective
        return proj_heads, copy(labels)
-    while smallest_np_arc != None:
+    while smallest_np_arc is not None:
        _lift(smallest_np_arc, proj_heads)
        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    deco_labels = _decorate(heads, proj_heads, labels)
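Continuing that example: projectivize repeatedly lifts the smallest non-projective arc until the tree is projective, and the decoration (next hunk) records the original head's label. Expected behaviour, assuming _lift (not shown in this diff) reattaches the arc one ancestor up:

    # The crossing arc 3 -> 1 is lifted to the grandparent, 3 -> 0, and
    # the label is decorated so deprojectivize can restore it later.
    heads = [0, 0, 0, 1]
    labels = ['root', 'dobj', 'advmod', 'relcl']
    proj_heads, deco_labels = projectivize(heads, labels)
    print(proj_heads)   # [0, 0, 0, 0]
    print(deco_labels)  # ['root', 'dobj', 'advmod', 'relcl||dobj']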
@@ -114,24 +112,26 @@ def projectivize(heads, labels):

 def deprojectivize(tokens):
-    # reattach arcs with decorated labels (following HEAD scheme)
-    # for each decorated arc X||Y, search top-down, left-to-right,
-    # breadth-first until hitting a Y then make this the new head
+    # Reattach arcs with decorated labels (following HEAD scheme). For each
+    # decorated arc X||Y, search top-down, left-to-right, breadth-first until
+    # hitting a Y then make this the new head.
    for token in tokens:
        if is_decorated(token.dep_):
-            newlabel,headlabel = decompose(token.dep_)
-            newhead = _find_new_head(token,headlabel)
+            newlabel, headlabel = decompose(token.dep_)
+            newhead = _find_new_head(token, headlabel)
            token.head = newhead
            token.dep_ = newlabel
    return tokens


 def _decorate(heads, proj_heads, labels):
    # uses decoration scheme HEAD from Nivre & Nilsson 2005
    assert(len(heads) == len(proj_heads) == len(labels))
    deco_labels = []
-    for tokenid,head in enumerate(heads):
+    for tokenid, head in enumerate(heads):
        if head != proj_heads[tokenid]:
-            deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
+            deco_labels.append(
+                '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
        else:
            deco_labels.append(labels[tokenid])
    return deco_labels
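The decorated label round-trips through decompose, which splits on DELIMITER, so deprojectivize knows both the real label and the head label that _find_new_head searches for (behaviour inferred from the HEAD scheme described above):

    # Round-trip of the decoration from the projectivize example.
    print(decompose('relcl||dobj'))  # ('relcl', 'dobj')
    print(is_decorated('relcl||dobj'), is_decorated('relcl'))  # True False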
@@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
    # and ties are broken left to right
    smallest_size = float('inf')
    smallest_np_arc = None
-    for tokenid,head in enumerate(heads):
+    for tokenid, head in enumerate(heads):
        size = abs(tokenid-head)
-        if size < smallest_size and is_nonproj_arc(tokenid,heads):
+        if size < smallest_size and is_nonproj_arc(tokenid, heads):
            smallest_size = size
            smallest_np_arc = tokenid
    return smallest_np_arc
@@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
        next_queue = []
        for qtoken in queue:
            for child in qtoken.children:
-                if child.is_space: continue
-                if child == token: continue
+                if child.is_space:
+                    continue
+                if child == token:
+                    continue
                if child.dep_ == headlabel:
                    return child
                next_queue.append(child)
@@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
    for raw_text, sents in gold_tuples:
        filtered_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
-            filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
+            filtered_labels = [decompose(label)[0]
+                               if freqs.get(label, cutoff) < cutoff
+                               else label for label in labels]
+            filtered_sents.append(
+                ((ids, words, tags, heads, filtered_labels, iob), ctnts))
        filtered.append((raw_text, filtered_sents))
    return filtered

View File

@@ -2,17 +2,8 @@
 # cython: infer_types=True
 from __future__ import unicode_literals

-from libc.string cimport memcpy, memset
-from libc.stdint cimport uint32_t, uint64_t
 import numpy
-from ..vocab cimport EMPTY_LEXEME
-from ..structs cimport Entity
-from ..lexeme cimport Lexeme
-from ..symbols cimport punct
-from ..attrs cimport IS_SPACE
-from ..attrs cimport attr_id_t
-from ..tokens.token cimport Token
 from ..tokens.doc cimport Doc

View File

@@ -2,17 +2,17 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
+from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
-from collections import defaultdict, OrderedDict
+from collections import OrderedDict
 import ujson

-from .. import util
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
-from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
 from ..typedefs cimport attr_t
+from ..compat import json_dumps
+from .. import util


 cdef weight_t MIN_SCORE = -90000
@@ -136,11 +136,12 @@ cdef class TransitionSystem:
            print([gold.c.ner[i].clas for i in range(gold.length)])
            print([gold.c.ner[i].move for i in range(gold.length)])
            print([gold.c.ner[i].label for i in range(gold.length)])
-            print("Self labels", [self.c[i].label for i in range(self.n_moves)])
+            print("Self labels",
+                  [self.c[i].label for i in range(self.n_moves)])
            raise ValueError(
                "Could not find a gold-standard action to supervise "
-                "the entity recognizer\n"
-                "The transition system has %d actions." % (self.n_moves))
+                "the entity recognizer. The transition system has "
+                "%d actions." % (self.n_moves))

    def get_class_name(self, int clas):
        act = self.c[clas]
@@ -149,7 +150,7 @@ cdef class TransitionSystem:
    def add_action(self, int action, label_name):
        cdef attr_t label_id
        if not isinstance(label_name, int) and \
-            not isinstance(label_name, long):
+           not isinstance(label_name, long):
            label_id = self.strings.add(label_name)
        else:
            label_id = label_name
@@ -186,7 +187,7 @@ cdef class TransitionSystem:
                'name': self.move_name(trans.move, trans.label)
            })
        serializers = {
-            'transitions': lambda: ujson.dumps(transitions),
+            'transitions': lambda: json_dumps(transitions),
            'strings': lambda: self.strings.to_bytes()
        }
        return util.to_bytes(serializers, exclude)
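to_bytes builds a dict of named serializer callbacks and hands it to util.to_bytes. A minimal sketch of that pattern (not spaCy's exact implementation, which packs the results into a byte string):

    from collections import OrderedDict

    def to_bytes_sketch(serializers, exclude=tuple()):
        # Invoke each callback lazily, skipping excluded sections.
        return OrderedDict((name, getter())
                           for name, getter in serializers.items()
                           if name not in exclude)

    serializers = {
        'transitions': lambda: '[]',
        'strings': lambda: b'...',
    }
    print(list(to_bytes_sketch(serializers, exclude=['strings'])))
    # ['transitions']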