Tidy up syntax

ines 2017-10-27 19:45:57 +02:00
parent 5167a0cce2
commit b4d226a3f1
8 changed files with 195 additions and 230 deletions

View File

@@ -2,7 +2,7 @@
# cython: profile=True
cimport numpy as np
import numpy
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport PyObject, Py_XDECREF
from thinc.extra.search cimport Beam
from thinc.extra.search import MaxViolation
from thinc.typedefs cimport hash_t, class_t
@@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from .stateclass cimport StateClass
from ..gold cimport GoldParse
from ..tokens.doc cimport Doc
# These are passed as callbacks to thinc.extra.search.Beam
@@ -50,7 +49,7 @@ cdef class ParserBeam(object):
cdef public object dones
def __init__(self, TransitionSystem moves, states, golds,
int width, float density):
int width, float density):
self.moves = moves
self.states = states
self.golds = golds
@@ -59,7 +58,8 @@ cdef class ParserBeam(object):
cdef StateClass state, st
for state in states:
beam = Beam(self.moves.n_moves, width, density)
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
beam.initialize(self.moves.init_beam_state, state.c.length,
state.c._sent)
for i in range(beam.width):
st = <StateClass>beam.at(i)
st.c.offset = state.c.offset
@@ -74,7 +74,8 @@ cdef class ParserBeam(object):
@property
def is_done(self):
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
return all(b.is_done or self.dones[i]
for i, b in enumerate(self.beams))
def __getitem__(self, i):
return self.beams[i]
@@ -126,7 +127,8 @@ cdef class ParserBeam(object):
for i in range(beam.size):
state = <StateClass>beam.at(i)
if not state.c.is_final():
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
state, gold)
if follow_gold:
for j in range(beam.nr_class):
if beam.costs[i][j] >= 1:
@@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
c_ids += ids.shape[1]
return ids
nr_update = 0
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
states, golds,
state2vec, vec2scores,
@@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
if pbeam.is_done and gbeam.is_done:
break
# The beam maps let us find the right row in the flattened scores
# arrays for each state. States are identified by (example id, history).
# We keep a different beam map for each step (since we'll have a flat
# scores array for each step). The beam map will let us take the per-state
# losses, and compute the gradient for each (step, state, class).
# arrays for each state. States are identified by (example id,
# history). We keep a different beam map for each step (since we'll
# have a flat scores array for each step). The beam map will let us
# take the per-state losses, and compute the gradient for each (step,
# state, class).
beam_maps.append({})
# Gather all states from the two beams in a list. Some states may occur
# in both beams. To figure out which beam each state belonged to,
# we keep two lists of indices, p_indices and g_indices
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
nr_update)
if not states:
break
# Now that we have our flat list of states, feed them through the model
token_ids = get_token_ids(states, nr_feature)
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
if hist_feats:
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
hists = numpy.asarray([st.history[:hist_feats] for st in states],
dtype='i')
scores, bp_scores = vec2scores.begin_update((vectors, hists),
drop=drop)
else:
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
@@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
# Unpack the flat scores into lists for the two beams. The indices arrays
# tell us which example and state the scores-row refers to.
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in p_indices]
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
for indices in g_indices]
# Now advance the states in the beams. The gold beam is constrained
# to follow only gold analyses.
pbeam.advance(p_scores)
@@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
def get_gradient(nr_class, beam_maps, histories, losses):
"""
The global model assigns a loss to each parse. The beam scores
"""The global model assigns a loss to each parse. The beam scores
are additive, so the same gradient is applied to each action
in the history. This gives the gradient of a single *action*
for a beam state -- so we have "the gradient of loss for taking
@@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
if loss != 0.0 and not numpy.isnan(loss):
nr_step = max(nr_step, len(hist))
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
dtype='f'))
assert len(histories) == len(losses)
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
@@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
grads[j][i, clas] += loss
key = key + tuple([clas])
return grads
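
To make the gradient bookkeeping above concrete: a minimal standalone Python sketch (toy data and simplified names, not part of this commit) of how per-parse losses turn into per-action gradients through the beam maps, which are keyed by (example id, history prefix):

import numpy

def toy_get_gradient(nr_class, beam_maps, histories, losses):
    # One gradient matrix per step: each row is a state that was live at
    # that step, each column an action class.
    grads = [numpy.zeros((max(m.values()) + 1, nr_class), dtype='f')
             for m in beam_maps]
    for eg_id, hists in enumerate(histories):
        for loss, hist in zip(losses[eg_id], hists):
            key = tuple([eg_id])
            for j, clas in enumerate(hist):
                i = beam_maps[j][key]      # row for this (example, prefix)
                grads[j][i, clas] += loss  # same loss for every action taken
                key = key + tuple([clas])
    return grads

# Example 0 has two candidate parses, histories [0, 1] and [0, 2], with
# losses +0.5 and -0.5. The shared first action cancels out; the gradient
# concentrates on the actions where the parses diverge.
beam_maps = [{(0,): 0}, {(0, 0): 0}]
print(toy_get_gradient(3, beam_maps, [[[0, 1], [0, 2]]], [[0.5, -0.5]]))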

View File

@@ -1 +0,0 @@
# test

View File

@@ -4,24 +4,16 @@
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
import ctypes
from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from collections import OrderedDict
from thinc.extra.search cimport Beam
import numpy
from .stateclass cimport StateClass
from ._state cimport StateC, is_space_token
from ._state cimport StateC
from .nonproj import is_nonproj_tree
from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
from ..lexeme cimport Lexeme
from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC
@@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT'])
)))
actions = kwargs.get('actions', OrderedDict((
(SHIFT, ['']),
(REDUCE, ['']),
(RIGHT, []),
(LEFT, []),
(BREAK, ['ROOT']))
))
seen_actions = set()
for label in kwargs.get('left_labels', []):
if label.upper() != 'ROOT':
@@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
if gold.cand_to_gold[i] is None:
continue
if state.safe_get(i).dep:
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
predicted.add((i, state.H(i),
self.strings[state.safe_get(i).dep]))
else:
predicted.add((i, state.H(i), 'ROOT'))
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
@@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
if not self.has_gold(gold):
return None
for i in range(gold.length):
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
# Missing values
if gold.heads[i] is None or gold.labels[i] is None:
gold.c.heads[i] = i
gold.c.has_dep[i] = False
else:
@@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
# Check projectivity --- leading cause
if is_nonproj_tree(gold.heads):
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
"Likely cause: the tree is non-projective (i.e. it has crossing "
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
"The ArcEager transition system only supports projective trees.\n"
"To learn non-projective representations, transform the data "
"before training and after parsing. Either pass make_projective=True "
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
"Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is "
"non-projective (i.e. it has crossing arcs -- see "
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
"transition system only supports projective trees. To "
"learn non-projective representations, transform the data "
"before training and after parsing. Either pass "
"make_projective=True to the GoldParse class, or use "
"spacy.syntax.nonproj.preprocess_training_data.")
else:
print(gold.orig_annot)
print(gold.words)
@@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the dependency "
"parser.\n"
"The GoldParse was projective.\n"
"The transition system has %d actions.\n"
"State at failure:\n"
"%s" % (self.n_moves, stcls.print_state(gold.words)))
"Could not find a gold-standard action to supervise the"
"dependency parser. The GoldParse was projective. The "
"transition system has %d actions. State at failure: %s"
% (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
def get_beam_annot(self, Beam beam):
@@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
deps[j].setdefault(dep, 0.0)
deps[j][dep] += prob
return heads, deps
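
As a usage note for the get_actions refactor above: the OrderedDict seeds the default transition inventory, and the label sets for RIGHT/LEFT grow from the labels seen in training. A toy sketch (standalone Python, constants and names simplified, not the spaCy API):

from collections import OrderedDict

SHIFT, REDUCE, RIGHT, LEFT, BREAK = range(5)

def toy_get_actions(**kwargs):
    # SHIFT/REDUCE/BREAK carry fixed labels; RIGHT/LEFT accumulate the
    # dependency labels observed in the gold parses.
    actions = kwargs.get('actions', OrderedDict((
        (SHIFT, ['']),
        (REDUCE, ['']),
        (RIGHT, []),
        (LEFT, []),
        (BREAK, ['ROOT']))
    ))
    for label in kwargs.get('left_labels', []):
        if label.upper() != 'ROOT' and label not in actions[LEFT]:
            actions[LEFT].append(label)
    for label in kwargs.get('right_labels', []):
        if label.upper() != 'ROOT' and label not in actions[RIGHT]:
            actions[RIGHT].append(label)
    return actions

print(toy_get_actions(left_labels=['nsubj', 'ROOT', 'nsubj'],
                      right_labels=['dobj']))
# LEFT ends up with ['nsubj'] (deduplicated, ROOT filtered out);
# RIGHT with ['dobj'].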

View File

@@ -4,17 +4,12 @@ from __future__ import unicode_literals
from thinc.typedefs cimport weight_t
from thinc.extra.search cimport Beam
from collections import OrderedDict
import numpy
from thinc.neural.ops import NumpyOps
from .stateclass cimport StateClass
from ._state cimport StateC
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..structs cimport TokenC, Entity
from ..gold cimport GoldParseC
from ..gold cimport GoldParse
from ..attrs cimport ENT_TYPE, ENT_IOB
from ..gold cimport GoldParseC, GoldParse
cdef enum:
@@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
@classmethod
def get_actions(cls, **kwargs):
actions = kwargs.get('actions',
OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
actions = kwargs.get('actions', OrderedDict((
(MISSING, ['']),
(BEGIN, []),
(IN, []),
(LAST, []),
(UNIT, []),
(OUT, [''])
)))
seen_entities = set()
for entity_type in kwargs.get('entity_types', []):
if entity_type in seen_entities:
@@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
cdef Transition lookup_transition(self, object name) except *:
cdef attr_t label
if name == '-' or name == None:
if name == '-' or name is None:
return Transition(clas=0, move=MISSING, label=0, score=0)
elif name == '!O':
return Transition(clas=0, move=ISNT, label=0, score=0)
@@ -328,8 +322,8 @@ cdef class In:
return False
elif preset_ent_iob == 3:
return False
# TODO: Is this quite right?
# I think it's supposed to be ensuring the gazetteer matches are maintained
# TODO: Is this quite right? I think it's supposed to be ensuring the
# gazetteer matches are maintained
elif st.B_(1).ent_iob != preset_ent_iob:
return False
# Don't allow entities to extend across sentence boundaries
@@ -354,10 +348,12 @@ cdef class In:
if g_act == MISSING:
return 0
elif g_act == BEGIN:
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
# I, Gold B --> True
# (P of bad open entity sunk, R of this entity sunk)
return 0
elif g_act == IN:
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
# I, Gold I --> True
# (label forced by prev, if mismatch, P and R both sunk)
return 0
elif g_act == LAST:
# I, Gold L --> True iff this entity sunk and next tag == O
@@ -505,11 +501,3 @@ cdef class Out:
return 1
else:
return 1
class OracleError(Exception):
pass
class UnknownMove(Exception):
pass
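
For readers unfamiliar with the move inventory in this file: BEGIN/IN/LAST/UNIT/OUT implement the BILUO tagging scheme for named entities. A standalone sketch (plain Python, not the spaCy API) of how token-level entity spans map onto BILUO tags:

def biluo_tags(n_tokens, spans):
    # spans are (start, end, label) with an exclusive end, as in Python
    # slicing. Single-token entities become U-; longer ones B- ... L-.
    tags = ['O'] * n_tokens
    for start, end, label in spans:
        if end - start == 1:
            tags[start] = 'U-' + label
        else:
            tags[start] = 'B-' + label
            for i in range(start + 1, end - 1):
                tags[i] = 'I-' + label
            tags[end - 1] = 'L-' + label
    return tags

# "Apple Inc. opened in San Francisco": ORG over tokens 0-1, GPE over 4-5
print(biluo_tags(6, [(0, 2, 'ORG'), (4, 6, 'GPE')]))
# ['B-ORG', 'L-ORG', 'O', 'O', 'B-GPE', 'L-GPE']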

View File

@@ -5,71 +5,48 @@
# coding: utf-8
from __future__ import unicode_literals, print_function
from collections import Counter, OrderedDict
from collections import OrderedDict
import ujson
import json
import contextlib
import numpy
from libc.math cimport exp
cimport cython
cimport cython.parallel
import cytoolz
import dill
import numpy.random
cimport numpy as np
from libcpp.vector cimport vector
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport Vec, VecVec
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
from thinc.extra.eg cimport Example
from libc.math cimport exp
from libcpp.vector cimport vector
from libc.string cimport memset
from libc.stdlib cimport calloc, free
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t, class_t, hash_t
from thinc.extra.search cimport Beam
from cymem.cymem cimport Pool, Address
from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from thinc.api import layerize, chain, clone, with_flatten
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
from thinc.api import chain, clone
from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module
from .. import util
from ..util import get_async, get_cuda_stream
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
from .._ml import Tok2Vec, doc2feats, rebatch
from .._ml import Residual, flatten
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
from .._ml import link_vectors_to_models
from ..compat import json_dumps, copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
from . import nonproj
from .transition_system import OracleError
from .transition_system cimport TransitionSystem, Transition
from ..structs cimport TokenC
from ..tokens.doc cimport Doc
from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
from . import _beam_utils
from .transition_system cimport Transition
from . import _beam_utils, nonproj
def get_templates(*args, **kwargs):
return []
DEBUG = False
def set_debug(val):
global DEBUG
DEBUG = val
@@ -100,7 +77,8 @@ cdef class precompute_hiddens:
cdef object _cuda_stream
cdef object _bp_hiddens
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
drop=0.):
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
@@ -120,8 +98,7 @@ cdef class precompute_hiddens:
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized \
and self._cuda_stream is not None:
if not self._is_synchronized and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
@@ -130,7 +107,8 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0]
def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO*self.nP), dtype='f')
# This is tricky, but (assuming GPU available):
# - Input to forward on CPU
# - Output from forward on CPU
@@ -141,8 +119,8 @@ cdef class precompute_hiddens:
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
feat_weights, &ids[0, 0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector, sgd=None):
@@ -161,10 +139,11 @@ cdef class precompute_hiddens:
state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
return best, backprop
cdef void sum_state_features(float* output,
@@ -239,11 +218,15 @@ cdef class Parser:
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
if depth != 1:
raise ValueError("Currently parser depth is hard-coded to 1.")
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
cfg.get('maxout_pieces', 2))
if parser_maxout_pieces != 2:
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
raise ValueError("Currently parser_maxout_pieces is hard-coded "
"to 2")
token_vector_width = util.env_opt('token_vector_width',
cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width',
cfg.get('hidden_width', 200))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -365,8 +348,8 @@ cdef class Parser:
parse_states = self.parse_batch(subbatch)
beams = []
else:
beams = self.beam_parse(subbatch,
beam_width=beam_width, beam_density=beam_density)
beams = self.beam_parse(subbatch, beam_width=beam_width,
beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
@@ -386,9 +369,9 @@ cdef class Parser:
if isinstance(docs, Doc):
docs = [docs]
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
nr_state = len(docs)
nr_class = self.moves.n_moves
nr_dim = tokvecs.shape[1]
@@ -402,7 +385,8 @@ cdef class Parser:
feat_weights = state2vec.get_feat_weights()
cdef int i
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
vec2scores._layers[-1].W.T)
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
hW = <float*>hidden_weights.data
@@ -462,9 +446,9 @@ cdef class Parser:
cdef Doc doc
cdef int nr_class = self.moves.n_moves
cdef StateClass stcls, output
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
0.0)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, 0.0)
beams = []
cdef int offset = 0
cdef int j = 0
@@ -519,9 +503,7 @@ cdef class Parser:
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
docs = [docs]
golds = [golds]
cuda_stream = get_cuda_stream()
cuda_stream = util.get_cuda_stream()
states, golds, max_steps = self._init_gold_batch(docs, golds)
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
drop)
@@ -536,7 +518,6 @@ cdef class Parser:
n_steps = 0
while todo:
states, golds = zip(*todo)
token_ids = self.get_token_ids(states)
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
if drop != 0:
@@ -558,8 +539,8 @@ cdef class Parser:
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
backprops.append((
get_async(cuda_stream, token_ids),
get_async(cuda_stream, d_vector),
util.get_async(cuda_stream, token_ids),
util.get_async(cuda_stream, d_vector),
bp_vector
))
else:
@@ -592,15 +573,13 @@ cdef class Parser:
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
cuda_stream = get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
states, golds,
state2vec, vec2scores,
width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses)
cuda_stream = util.get_cuda_stream()
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
docs, cuda_stream, drop)
states_d_scores, backprops = _beam_utils.update_beam(
self.moves, self.nr_feature, 500, states, golds, state2vec,
vec2scores, width, density, self.cfg.get('hist_size', 0),
drop=drop, losses=losses)
backprop_lower = []
cdef float batch_size = len(docs)
for i, d_scores in enumerate(states_d_scores):
@@ -612,13 +591,14 @@ cdef class Parser:
if isinstance(self.model[0].ops, CupyOps) \
and not isinstance(ids, state2vec.ops.xp.ndarray):
backprop_lower.append((
get_async(cuda_stream, ids),
get_async(cuda_stream, d_vector),
util.get_async(cuda_stream, ids),
util.get_async(cuda_stream, d_vector),
bp_vectors))
else:
backprop_lower.append((ids, d_vector, bp_vectors))
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
cuda_stream)
def _init_gold_batch(self, whole_docs, whole_golds):
"""Make a square batch, of length equal to the shortest doc. A long
@@ -768,7 +748,8 @@ cdef class Parser:
def begin_training(self, gold_tuples, pipeline=None, **cfg):
if 'model' in cfg:
self.model = cfg['model']
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
label_freq_cutoff=100)
actions = self.moves.get_actions(gold_parses=gold_tuples)
for action, labels in actions.items():
for label in labels:
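
The _nonlinearity change earlier in this file leans on the maxout trick: the hidden layer computes nP candidate "pieces" per output unit and keeps the best, with `which` recording the winner so backprop_maxout can route gradients only to it. A minimal numpy sketch (assuming ops.maxout behaves as described; simplified, CPU-only):

import numpy

def maxout(state_vector, nP):
    # Reshape (n_states, nO*nP) to (n_states, nO, nP) and keep, for each
    # output unit, the best of its nP pieces. `which` records the winning
    # piece so the backward pass can send gradients to it alone.
    n, width = state_vector.shape
    pieces = state_vector.reshape((n, width // nP, nP))
    which = pieces.argmax(axis=-1)
    best = pieces.max(axis=-1)
    return best, which

X = numpy.array([[1., 4., 3., 2.]], dtype='f')
best, which = maxout(X, nP=2)
print(best, which)  # [[4. 3.]] [[1 0]]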

View File

@@ -1,39 +1,37 @@
# coding: utf-8
"""
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
for doing pseudo-projective parsing; the implementation uses the HEAD decoration
scheme.
"""
from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..attrs import DEP, HEAD
DELIMITER = '||'
def ancestors(tokenid, heads):
# returns all words going from the word up the path to the root
# the path to root cannot be longer than the number of words in the sentence
# this function ends after at most len(heads) steps
# because it would otherwise loop indefinitely on cycles
# Returns all words going from the word up the path to the root. The path
# to root cannot be longer than the number of words in the sentence. This
# function ends after at most len(heads) steps, because it would otherwise
# loop indefinitely on cycles.
head = tokenid
cnt = 0
while heads[head] != head and cnt < len(heads):
head = heads[head]
cnt += 1
yield head
if head == None:
if head is None:
break
def contains_cycle(heads):
# in an acyclic tree, the path from each word following
# the head relation upwards always ends at the root node
# in an acyclic tree, the path from each word following the head relation
# upwards always ends at the root node
for tokenid in range(len(heads)):
seen = set([tokenid])
for ancestor in ancestors(tokenid,heads):
for ancestor in ancestors(tokenid, heads):
if ancestor in seen:
return seen
seen.add(ancestor)
@@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
# if there is a token k, h < k < d such that h is not
# an ancestor of k. Same for h -> d, h > d
head = heads[tokenid]
if head == tokenid: # root arcs cannot be non-projective
if head == tokenid: # root arcs cannot be non-projective
return False
elif head == None: # unattached tokens cannot be non-projective
elif head is None: # unattached tokens cannot be non-projective
return False
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
for k in range(start,end):
for ancestor in ancestors(k,heads):
if ancestor == None: # for unattached tokens/subtrees
for k in range(start, end):
for ancestor in ancestors(k, heads):
if ancestor is None: # for unattached tokens/subtrees
break
elif ancestor == head: # normal case: k dominated by h
elif ancestor == head: # normal case: k dominated by h
break
else: # head not in ancestors: d -> h is non-projective
else: # head not in ancestors: d -> h is non-projective
return True
return False
def is_nonproj_tree(heads):
# a tree is non-projective if at least one arc is non-projective
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
def decompose(label):
@@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
for raw_text, sents in gold_tuples:
prepro_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
proj_heads,deco_labels = projectivize(heads,labels)
proj_heads, deco_labels = projectivize(heads, labels)
# set the label to ROOT for each root dependent
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
deco_labels = ['ROOT' if head == i else deco_labels[i]
for i, head in enumerate(proj_heads)]
# count label frequencies
if label_freq_cutoff > 0:
for label in deco_labels:
if is_decorated(label):
freqs[label] = freqs.get(label,0) + 1
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
freqs[label] = freqs.get(label, 0) + 1
prepro_sents.append(
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
preprocessed.append((raw_text, prepro_sents))
if label_freq_cutoff > 0:
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
return preprocessed
def projectivize(heads, labels):
# use the algorithm by Nivre & Nilsson 2005
# assumes heads to be a proper tree, i.e. connected and cycle-free
# returns a new pair (heads,labels) which encode
# a projective and decorated tree
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
# which encode a projective and decorated tree.
proj_heads = copy(heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
if smallest_np_arc == None: # this sentence is already projective
if smallest_np_arc is None: # this sentence is already projective
return proj_heads, copy(labels)
while smallest_np_arc != None:
while smallest_np_arc is not None:
_lift(smallest_np_arc, proj_heads)
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
deco_labels = _decorate(heads, proj_heads, labels)
@@ -114,24 +112,26 @@ def projectivize(heads, labels):
def deprojectivize(tokens):
# reattach arcs with decorated labels (following HEAD scheme)
# for each decorated arc X||Y, search top-down, left-to-right,
# breadth-first until hitting a Y then make this the new head
# Reattach arcs with decorated labels (following HEAD scheme). For each
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
# hitting a Y then make this the new head.
for token in tokens:
if is_decorated(token.dep_):
newlabel,headlabel = decompose(token.dep_)
newhead = _find_new_head(token,headlabel)
newlabel, headlabel = decompose(token.dep_)
newhead = _find_new_head(token, headlabel)
token.head = newhead
token.dep_ = newlabel
return tokens
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
deco_labels = []
for tokenid,head in enumerate(heads):
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
deco_labels.append(
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
else:
deco_labels.append(labels[tokenid])
return deco_labels
@@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
# and ties are broken left to right
smallest_size = float('inf')
smallest_np_arc = None
for tokenid,head in enumerate(heads):
for tokenid, head in enumerate(heads):
size = abs(tokenid-head)
if size < smallest_size and is_nonproj_arc(tokenid,heads):
if size < smallest_size and is_nonproj_arc(tokenid, heads):
smallest_size = size
smallest_np_arc = tokenid
return smallest_np_arc
@@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
next_queue = []
for qtoken in queue:
for child in qtoken.children:
if child.is_space: continue
if child == token: continue
if child.is_space:
continue
if child == token:
continue
if child.dep_ == headlabel:
return child
next_queue.append(child)
@@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
for raw_text, sents in gold_tuples:
filtered_sents = []
for (ids, words, tags, heads, labels, iob), ctnts in sents:
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
filtered_labels = [decompose(label)[0]
if freqs.get(label, cutoff) < cutoff
else label for label in labels]
filtered_sents.append(
((ids, words, tags, heads, filtered_labels, iob), ctnts))
filtered.append((raw_text, filtered_sents))
return filtered
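
A worked example of this file's pipeline (toy data; the values below were stepped through by hand, since the Cython module is not directly importable as plain Python):

# heads[i] is the index of token i's head; the root points at itself.
# Arc 0 -> 2 crosses arc 1 -> 3, so the tree is non-projective.
heads = [1, 1, 0, 1]
labels = ['det', 'ROOT', 'dobj', 'punct']

# projectivize(heads, labels) lifts the shortest non-projective arc one
# step: token 2 is reattached from 0 to 0's own head, 1, and _decorate()
# records the lost head's label behind the delimiter:
proj_heads = [1, 1, 1, 1]                          # now projective
deco_labels = ['det', 'ROOT', 'dobj||det', 'punct']

# At parse time, deprojectivize() sees the decorated label, splits it with
# decompose('dobj||det') -> ('dobj', 'det'), then searches breadth-first
# from token 2's head for a descendant labelled 'det' and reattaches
# token 2 there, recovering the original heads.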

View File

@@ -2,17 +2,8 @@
# cython: infer_types=True
from __future__ import unicode_literals
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t, uint64_t
import numpy
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
from ..attrs cimport attr_id_t
from ..tokens.token cimport Token
from ..tokens.doc cimport Doc

View File

@@ -2,17 +2,17 @@
# coding: utf-8
from __future__ import unicode_literals
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
from thinc.typedefs cimport weight_t
from collections import defaultdict, OrderedDict
from collections import OrderedDict
import ujson
from .. import util
from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
from ..typedefs cimport attr_t
from ..compat import json_dumps
from .. import util
cdef weight_t MIN_SCORE = -90000
@@ -136,11 +136,12 @@ cdef class TransitionSystem:
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
print("Self labels",
[self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer\n"
"The transition system has %d actions." % (self.n_moves))
"the entity recognizer. The transition system has "
"%d actions." % (self.n_moves))
def get_class_name(self, int clas):
act = self.c[clas]
@@ -149,7 +150,7 @@ cdef class TransitionSystem:
def add_action(self, int action, label_name):
cdef attr_t label_id
if not isinstance(label_name, int) and \
not isinstance(label_name, long):
not isinstance(label_name, long):
label_id = self.strings.add(label_name)
else:
label_id = label_name
@@ -186,7 +187,7 @@ cdef class TransitionSystem:
'name': self.move_name(trans.move, trans.label)
})
serializers = {
'transitions': lambda: ujson.dumps(transitions),
'transitions': lambda: json_dumps(transitions),
'strings': lambda: self.strings.to_bytes()
}
return util.to_bytes(serializers, exclude)
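
The serializers dict above maps section names to zero-argument callables, so each part is produced lazily and can be skipped via exclude. A minimal sketch of the pattern (standalone; spaCy's actual util.to_bytes packs the message differently):

import json

def to_bytes(getters, exclude):
    # Call each getter only if its section is not excluded.
    msg = {key: getter() for key, getter in getters.items()
           if key not in exclude}
    return json.dumps(msg).encode('utf8')

serializers = {
    'transitions': lambda: [{'clas': 0, 'name': 'SHIFT'}],
    'strings': lambda: 'strings-data',
}
print(to_bytes(serializers, exclude=['strings']))
# b'{"transitions": [{"clas": 0, "name": "SHIFT"}]}'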