mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Tidy up syntax
This commit is contained in:
parent
5167a0cce2
commit
b4d226a3f1
|
@ -2,7 +2,7 @@
|
|||
# cython: profile=True
|
||||
cimport numpy as np
|
||||
import numpy
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.ref cimport PyObject, Py_XDECREF
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.extra.search import MaxViolation
|
||||
from thinc.typedefs cimport hash_t, class_t
|
||||
|
@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
|
|||
from .transition_system cimport TransitionSystem, Transition
|
||||
from .stateclass cimport StateClass
|
||||
from ..gold cimport GoldParse
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
|
@ -50,7 +49,7 @@ cdef class ParserBeam(object):
|
|||
cdef public object dones
|
||||
|
||||
def __init__(self, TransitionSystem moves, states, golds,
|
||||
int width, float density):
|
||||
int width, float density):
|
||||
self.moves = moves
|
||||
self.states = states
|
||||
self.golds = golds
|
||||
|
@ -59,7 +58,8 @@ cdef class ParserBeam(object):
|
|||
cdef StateClass state, st
|
||||
for state in states:
|
||||
beam = Beam(self.moves.n_moves, width, density)
|
||||
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
|
||||
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||
state.c._sent)
|
||||
for i in range(beam.width):
|
||||
st = <StateClass>beam.at(i)
|
||||
st.c.offset = state.c.offset
|
||||
|
@ -74,7 +74,8 @@ cdef class ParserBeam(object):
|
|||
|
||||
@property
|
||||
def is_done(self):
|
||||
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
|
||||
return all(b.is_done or self.dones[i]
|
||||
for i, b in enumerate(self.beams))
|
||||
|
||||
def __getitem__(self, i):
|
||||
return self.beams[i]
|
||||
|
@ -126,7 +127,8 @@ cdef class ParserBeam(object):
|
|||
for i in range(beam.size):
|
||||
state = <StateClass>beam.at(i)
|
||||
if not state.c.is_final():
|
||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
|
||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||
state, gold)
|
||||
if follow_gold:
|
||||
for j in range(beam.nr_class):
|
||||
if beam.costs[i][j] >= 1:
|
||||
|
@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
|
|||
c_ids += ids.shape[1]
|
||||
return ids
|
||||
|
||||
|
||||
nr_update = 0
|
||||
|
||||
|
||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
|
@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
|||
if pbeam.is_done and gbeam.is_done:
|
||||
break
|
||||
# The beam maps let us find the right row in the flattened scores
|
||||
# arrays for each state. States are identified by (example id, history).
|
||||
# We keep a different beam map for each step (since we'll have a flat
|
||||
# scores array for each step). The beam map will let us take the per-state
|
||||
# losses, and compute the gradient for each (step, state, class).
|
||||
# arrays for each state. States are identified by (example id,
|
||||
# history). We keep a different beam map for each step (since we'll
|
||||
# have a flat scores array for each step). The beam map will let us
|
||||
# take the per-state losses, and compute the gradient for each (step,
|
||||
# state, class).
|
||||
beam_maps.append({})
|
||||
# Gather all states from the two beams in a list. Some states may occur
|
||||
# in both beams. To figure out which beam each state belonged to,
|
||||
# we keep two lists of indices, p_indices and g_indices
|
||||
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
|
||||
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
|
||||
nr_update)
|
||||
if not states:
|
||||
break
|
||||
# Now that we have our flat list of states, feed them through the model
|
||||
token_ids = get_token_ids(states, nr_feature)
|
||||
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||
if hist_feats:
|
||||
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
|
||||
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
|
||||
hists = numpy.asarray([st.history[:hist_feats] for st in states],
|
||||
dtype='i')
|
||||
scores, bp_scores = vec2scores.begin_update((vectors, hists),
|
||||
drop=drop)
|
||||
else:
|
||||
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||
|
||||
|
@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
|||
|
||||
# Unpack the flat scores into lists for the two beams. The indices arrays
|
||||
# tell us which example and state the scores-row refers to.
|
||||
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
|
||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
|
||||
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||
for indices in p_indices]
|
||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||
for indices in g_indices]
|
||||
# Now advance the states in the beams. The gold beam is constrained to
|
||||
# follow only gold analyses.
|
||||
pbeam.advance(p_scores)
|
||||
|
@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
|||
|
||||
|
||||
def get_gradient(nr_class, beam_maps, histories, losses):
|
||||
"""
|
||||
The global model assigns a loss to each parse. The beam scores
|
||||
"""The global model assigns a loss to each parse. The beam scores
|
||||
are additive, so the same gradient is applied to each action
|
||||
in the history. This gives the gradient of a single *action*
|
||||
for a beam state -- so we have "the gradient of loss for taking
|
||||
|
@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
if loss != 0.0 and not numpy.isnan(loss):
|
||||
nr_step = max(nr_step, len(hist))
|
||||
for i in range(nr_step):
|
||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
|
||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
|
||||
dtype='f'))
|
||||
assert len(histories) == len(losses)
|
||||
for eg_id, hists in enumerate(histories):
|
||||
for loss, hist in zip(losses[eg_id], hists):
|
||||
|
@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
grads[j][i, clas] += loss
|
||||
key = key + tuple([clas])
|
||||
return grads
|
||||
|
||||
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
# test
|
|
@ -4,24 +4,16 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
import ctypes
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.string cimport memcpy
|
||||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from collections import OrderedDict
|
||||
from thinc.extra.search cimport Beam
|
||||
import numpy
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC, is_space_token
|
||||
from ._state cimport StateC
|
||||
from .nonproj import is_nonproj_tree
|
||||
from .transition_system cimport do_func_t, get_cost_func_t
|
||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..gold cimport GoldParse, GoldParseC
|
||||
from ..structs cimport TokenC
|
||||
|
||||
|
||||
|
@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
|
|||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions',
|
||||
OrderedDict((
|
||||
(SHIFT, ['']),
|
||||
(REDUCE, ['']),
|
||||
(RIGHT, []),
|
||||
(LEFT, []),
|
||||
(BREAK, ['ROOT'])
|
||||
)))
|
||||
actions = kwargs.get('actions', OrderedDict((
|
||||
(SHIFT, ['']),
|
||||
(REDUCE, ['']),
|
||||
(RIGHT, []),
|
||||
(LEFT, []),
|
||||
(BREAK, ['ROOT']))
|
||||
))
|
||||
seen_actions = set()
|
||||
for label in kwargs.get('left_labels', []):
|
||||
if label.upper() != 'ROOT':
|
||||
|
@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
|
|||
if gold.cand_to_gold[i] is None:
|
||||
continue
|
||||
if state.safe_get(i).dep:
|
||||
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
|
||||
predicted.add((i, state.H(i),
|
||||
self.strings[state.safe_get(i).dep]))
|
||||
else:
|
||||
predicted.add((i, state.H(i), 'ROOT'))
|
||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||
|
@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
|
|||
if not self.has_gold(gold):
|
||||
return None
|
||||
for i in range(gold.length):
|
||||
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
|
||||
# Missing values
|
||||
if gold.heads[i] is None or gold.labels[i] is None:
|
||||
gold.c.heads[i] = i
|
||||
gold.c.has_dep[i] = False
|
||||
else:
|
||||
|
@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
|
|||
# Check projectivity --- leading cause
|
||||
if is_nonproj_tree(gold.heads):
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise the dependency "
|
||||
"parser.\n"
|
||||
"Likely cause: the tree is non-projective (i.e. it has crossing "
|
||||
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
|
||||
"The ArcEager transition system only supports projective trees.\n"
|
||||
"To learn non-projective representations, transform the data "
|
||||
"before training and after parsing. Either pass make_projective=True "
|
||||
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
||||
"Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. Likely cause: the tree is "
|
||||
"non-projective (i.e. it has crossing arcs -- see "
|
||||
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
|
||||
"transition system only supports projective trees. To "
|
||||
"learn non-projective representations, transform the data "
|
||||
"before training and after parsing. Either pass "
|
||||
"make_projective=True to the GoldParse class, or use "
|
||||
"spacy.syntax.nonproj.preprocess_training_data.")
|
||||
else:
|
||||
print(gold.orig_annot)
|
||||
print(gold.words)
|
||||
|
@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
|
|||
print(gold.labels)
|
||||
print(gold.sent_starts)
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise the dependency "
|
||||
"parser.\n"
|
||||
"The GoldParse was projective.\n"
|
||||
"The transition system has %d actions.\n"
|
||||
"State at failure:\n"
|
||||
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
||||
"Could not find a gold-standard action to supervise the"
|
||||
"dependency parser. The GoldParse was projective. The "
|
||||
"transition system has %d actions. State at failure: %s"
|
||||
% (self.n_moves, stcls.print_state(gold.words)))
|
||||
assert n_gold >= 1
|
||||
|
||||
def get_beam_annot(self, Beam beam):
|
||||
|
@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
|
|||
deps[j].setdefault(dep, 0.0)
|
||||
deps[j][dep] += prob
|
||||
return heads, deps
|
||||
|
||||
|
|
|
@ -4,17 +4,12 @@ from __future__ import unicode_literals
|
|||
from thinc.typedefs cimport weight_t
|
||||
from thinc.extra.search cimport Beam
|
||||
from collections import OrderedDict
|
||||
import numpy
|
||||
from thinc.neural.ops import NumpyOps
|
||||
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from .transition_system cimport Transition
|
||||
from .transition_system cimport do_func_t
|
||||
from ..structs cimport TokenC, Entity
|
||||
from ..gold cimport GoldParseC
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
||||
from ..gold cimport GoldParseC, GoldParse
|
||||
|
||||
|
||||
cdef enum:
|
||||
|
@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
@classmethod
|
||||
def get_actions(cls, **kwargs):
|
||||
actions = kwargs.get('actions',
|
||||
OrderedDict((
|
||||
(MISSING, ['']),
|
||||
(BEGIN, []),
|
||||
(IN, []),
|
||||
(LAST, []),
|
||||
(UNIT, []),
|
||||
(OUT, [''])
|
||||
)))
|
||||
actions = kwargs.get('actions', OrderedDict((
|
||||
(MISSING, ['']),
|
||||
(BEGIN, []),
|
||||
(IN, []),
|
||||
(LAST, []),
|
||||
(UNIT, []),
|
||||
(OUT, [''])
|
||||
)))
|
||||
seen_entities = set()
|
||||
for entity_type in kwargs.get('entity_types', []):
|
||||
if entity_type in seen_entities:
|
||||
|
@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
cdef attr_t label
|
||||
if name == '-' or name == None:
|
||||
if name == '-' or name is None:
|
||||
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||
elif name == '!O':
|
||||
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||
|
@ -328,8 +322,8 @@ cdef class In:
|
|||
return False
|
||||
elif preset_ent_iob == 3:
|
||||
return False
|
||||
# TODO: Is this quite right?
|
||||
# I think it's supposed to be ensuring the gazetteer matches are maintained
|
||||
# TODO: Is this quite right? I think it's supposed to be ensuring the
|
||||
# gazetteer matches are maintained
|
||||
elif st.B_(1).ent_iob != preset_ent_iob:
|
||||
return False
|
||||
# Don't allow entities to extend across sentence boundaries
|
||||
|
@ -354,10 +348,12 @@ cdef class In:
|
|||
if g_act == MISSING:
|
||||
return 0
|
||||
elif g_act == BEGIN:
|
||||
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
|
||||
# I, Gold B --> True
|
||||
# (P of bad open entity sunk, R of this entity sunk)
|
||||
return 0
|
||||
elif g_act == IN:
|
||||
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
|
||||
# I, Gold I --> True
|
||||
# (label forced by prev, if mismatch, P and R both sunk)
|
||||
return 0
|
||||
elif g_act == LAST:
|
||||
# I, Gold L --> True iff this entity sunk and next tag == O
|
||||
|
@ -505,11 +501,3 @@ cdef class Out:
|
|||
return 1
|
||||
else:
|
||||
return 1
|
||||
|
||||
|
||||
class OracleError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class UnknownMove(Exception):
|
||||
pass
|
||||
|
|
|
@ -5,71 +5,48 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from collections import Counter, OrderedDict
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
import json
|
||||
import contextlib
|
||||
import numpy
|
||||
|
||||
from libc.math cimport exp
|
||||
cimport cython
|
||||
cimport cython.parallel
|
||||
import cytoolz
|
||||
import dill
|
||||
|
||||
import numpy.random
|
||||
cimport numpy as np
|
||||
|
||||
from libcpp.vector cimport vector
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.ref cimport PyObject, Py_XDECREF
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport malloc, calloc, free
|
||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
||||
from thinc.extra.eg cimport Example
|
||||
from libc.math cimport exp
|
||||
from libcpp.vector cimport vector
|
||||
from libc.string cimport memset
|
||||
from libc.stdlib cimport calloc, free
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||
from thinc.extra.search cimport Beam
|
||||
|
||||
from cymem.cymem cimport Pool, Address
|
||||
from murmurhash.mrmr cimport hash64
|
||||
from preshed.maps cimport MapStruct
|
||||
from preshed.maps cimport map_get
|
||||
|
||||
from thinc.api import layerize, chain, clone, with_flatten
|
||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
||||
from thinc.api import chain, clone
|
||||
from thinc.v2v import Model, Maxout, Affine
|
||||
from thinc.misc import LayerNorm
|
||||
|
||||
from thinc.neural.ops import NumpyOps, CupyOps
|
||||
from thinc.neural.ops import CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
from .. import util
|
||||
from ..util import get_async, get_cuda_stream
|
||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
||||
from .._ml import Residual, flatten
|
||||
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
|
||||
from .._ml import link_vectors_to_models
|
||||
from ..compat import json_dumps, copy_array
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..gold cimport GoldParse
|
||||
from .. import util
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
from . import nonproj
|
||||
from .transition_system import OracleError
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..strings cimport StringStore
|
||||
from ..gold cimport GoldParse
|
||||
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
||||
from . import _beam_utils
|
||||
from .transition_system cimport Transition
|
||||
from . import _beam_utils, nonproj
|
||||
|
||||
|
||||
def get_templates(*args, **kwargs):
|
||||
return []
|
||||
|
||||
|
||||
DEBUG = False
|
||||
|
||||
|
||||
def set_debug(val):
|
||||
global DEBUG
|
||||
DEBUG = val
|
||||
|
@ -100,7 +77,8 @@ cdef class precompute_hiddens:
|
|||
cdef object _cuda_stream
|
||||
cdef object _bp_hiddens
|
||||
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||
drop=0.):
|
||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||
cdef np.ndarray cached
|
||||
if not isinstance(gpu_cached, numpy.ndarray):
|
||||
|
@ -120,8 +98,7 @@ cdef class precompute_hiddens:
|
|||
self._bp_hiddens = bp_features
|
||||
|
||||
cdef const float* get_feat_weights(self) except NULL:
|
||||
if not self._is_synchronized \
|
||||
and self._cuda_stream is not None:
|
||||
if not self._is_synchronized and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
return <float*>self._cached.data
|
||||
|
@ -130,7 +107,8 @@ cdef class precompute_hiddens:
|
|||
return self.begin_update(X)[0]
|
||||
|
||||
def begin_update(self, token_ids, drop=0.):
|
||||
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
|
||||
cdef np.ndarray state_vector = numpy.zeros(
|
||||
(token_ids.shape[0], self.nO*self.nP), dtype='f')
|
||||
# This is tricky, but (assuming GPU available);
|
||||
# - Input to forward on CPU
|
||||
# - Output from forward on CPU
|
||||
|
@ -141,8 +119,8 @@ cdef class precompute_hiddens:
|
|||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(<float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
feat_weights, &ids[0, 0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
def backward(d_state_vector, sgd=None):
|
||||
|
@ -161,10 +139,11 @@ cdef class precompute_hiddens:
|
|||
state_vector = state_vector.reshape(
|
||||
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
|
||||
best, which = self.ops.maxout(state_vector)
|
||||
|
||||
def backprop(d_best, sgd=None):
|
||||
return self.ops.backprop_maxout(d_best, which, self.nP)
|
||||
return best, backprop
|
||||
|
||||
return best, backprop
|
||||
|
||||
|
||||
cdef void sum_state_features(float* output,
|
||||
|
@ -239,11 +218,15 @@ cdef class Parser:
|
|||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||
if depth != 1:
|
||||
raise ValueError("Currently parser depth is hard-coded to 1.")
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||
cfg.get('maxout_pieces', 2))
|
||||
if parser_maxout_pieces != 2:
|
||||
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
|
||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
|
||||
raise ValueError("Currently parser_maxout_pieces is hard-coded "
|
||||
"to 2")
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
cfg.get('token_vector_width', 128))
|
||||
hidden_width = util.env_opt('hidden_width',
|
||||
cfg.get('hidden_width', 200))
|
||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
||||
|
@ -365,8 +348,8 @@ cdef class Parser:
|
|||
parse_states = self.parse_batch(subbatch)
|
||||
beams = []
|
||||
else:
|
||||
beams = self.beam_parse(subbatch,
|
||||
beam_width=beam_width, beam_density=beam_density)
|
||||
beams = self.beam_parse(subbatch, beam_width=beam_width,
|
||||
beam_density=beam_density)
|
||||
parse_states = []
|
||||
for beam in beams:
|
||||
parse_states.append(<StateClass>beam.at(0))
|
||||
|
@ -386,9 +369,9 @@ cdef class Parser:
|
|||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
0.0)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, 0.0)
|
||||
nr_state = len(docs)
|
||||
nr_class = self.moves.n_moves
|
||||
nr_dim = tokvecs.shape[1]
|
||||
|
@ -402,7 +385,8 @@ cdef class Parser:
|
|||
|
||||
feat_weights = state2vec.get_feat_weights()
|
||||
cdef int i
|
||||
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
|
||||
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
|
||||
vec2scores._layers[-1].W.T)
|
||||
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
|
||||
|
||||
hW = <float*>hidden_weights.data
|
||||
|
@ -462,9 +446,9 @@ cdef class Parser:
|
|||
cdef Doc doc
|
||||
cdef int nr_class = self.moves.n_moves
|
||||
cdef StateClass stcls, output
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
0.0)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, 0.0)
|
||||
beams = []
|
||||
cdef int offset = 0
|
||||
cdef int j = 0
|
||||
|
@ -519,9 +503,7 @@ cdef class Parser:
|
|||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||
docs = [docs]
|
||||
golds = [golds]
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||
drop)
|
||||
|
@ -536,7 +518,6 @@ cdef class Parser:
|
|||
n_steps = 0
|
||||
while todo:
|
||||
states, golds = zip(*todo)
|
||||
|
||||
token_ids = self.get_token_ids(states)
|
||||
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
||||
if drop != 0:
|
||||
|
@ -558,8 +539,8 @@ cdef class Parser:
|
|||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
backprops.append((
|
||||
get_async(cuda_stream, token_ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
util.get_async(cuda_stream, token_ids),
|
||||
util.get_async(cuda_stream, d_vector),
|
||||
bp_vector
|
||||
))
|
||||
else:
|
||||
|
@ -592,15 +573,13 @@ cdef class Parser:
|
|||
states = self.moves.init_batch(docs)
|
||||
for gold in golds:
|
||||
self.moves.preprocess_gold(gold)
|
||||
|
||||
cuda_stream = get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
|
||||
|
||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
width, density, self.cfg.get('hist_size', 0),
|
||||
drop=drop, losses=losses)
|
||||
cuda_stream = util.get_cuda_stream()
|
||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||
docs, cuda_stream, drop)
|
||||
states_d_scores, backprops = _beam_utils.update_beam(
|
||||
self.moves, self.nr_feature, 500, states, golds, state2vec,
|
||||
vec2scores, width, density, self.cfg.get('hist_size', 0),
|
||||
drop=drop, losses=losses)
|
||||
backprop_lower = []
|
||||
cdef float batch_size = len(docs)
|
||||
for i, d_scores in enumerate(states_d_scores):
|
||||
|
@ -612,13 +591,14 @@ cdef class Parser:
|
|||
if isinstance(self.model[0].ops, CupyOps) \
|
||||
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||
backprop_lower.append((
|
||||
get_async(cuda_stream, ids),
|
||||
get_async(cuda_stream, d_vector),
|
||||
util.get_async(cuda_stream, ids),
|
||||
util.get_async(cuda_stream, d_vector),
|
||||
bp_vectors))
|
||||
else:
|
||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
|
||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
|
||||
cuda_stream)
|
||||
|
||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||
"""Make a square batch, of length equal to the shortest doc. A long
|
||||
|
@ -768,7 +748,8 @@ cdef class Parser:
|
|||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||
if 'model' in cfg:
|
||||
self.model = cfg['model']
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
|
||||
label_freq_cutoff=100)
|
||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||
for action, labels in actions.items():
|
||||
for label in labels:
|
||||
|
|
|
@ -1,39 +1,37 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||
for doing pseudo-projective parsing. The implementation uses the HEAD decoration
|
||||
scheme.
|
||||
"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from copy import copy
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..attrs import DEP, HEAD
|
||||
|
||||
DELIMITER = '||'
|
||||
|
||||
|
||||
def ancestors(tokenid, heads):
|
||||
# returns all words going from the word up the path to the root
|
||||
# the path to root cannot be longer than the number of words in the sentence
|
||||
# this function ends after at most len(heads) steps
|
||||
# because it would otherwise loop indefinitely on cycles
|
||||
# Returns all words going from the word up the path to the root. The path
|
||||
# to root cannot be longer than the number of words in the sentence. This
|
||||
# function ends after at most len(heads) steps, because it would otherwise
|
||||
# loop indefinitely on cycles.
|
||||
head = tokenid
|
||||
cnt = 0
|
||||
while heads[head] != head and cnt < len(heads):
|
||||
head = heads[head]
|
||||
cnt += 1
|
||||
yield head
|
||||
if head == None:
|
||||
if head is None:
|
||||
break
|
||||
|
||||
|
||||
def contains_cycle(heads):
|
||||
# in an acyclic tree, the path from each word following
|
||||
# the head relation upwards always ends at the root node
|
||||
# in an acyclic tree, the path from each word following the head relation
|
||||
# upwards always ends at the root node
|
||||
for tokenid in range(len(heads)):
|
||||
seen = set([tokenid])
|
||||
for ancestor in ancestors(tokenid,heads):
|
||||
for ancestor in ancestors(tokenid, heads):
|
||||
if ancestor in seen:
|
||||
return seen
|
||||
seen.add(ancestor)
|
||||
|
@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
|
|||
# if there is a token k, h < k < d such that h is not
|
||||
# an ancestor of k. Same for h -> d, h > d
|
||||
head = heads[tokenid]
|
||||
if head == tokenid: # root arcs cannot be non-projective
|
||||
if head == tokenid: # root arcs cannot be non-projective
|
||||
return False
|
||||
elif head == None: # unattached tokens cannot be non-projective
|
||||
elif head is None: # unattached tokens cannot be non-projective
|
||||
return False
|
||||
|
||||
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
|
||||
for k in range(start,end):
|
||||
for ancestor in ancestors(k,heads):
|
||||
if ancestor == None: # for unattached tokens/subtrees
|
||||
for k in range(start, end):
|
||||
for ancestor in ancestors(k, heads):
|
||||
if ancestor is None: # for unattached tokens/subtrees
|
||||
break
|
||||
elif ancestor == head: # normal case: k dominated by h
|
||||
elif ancestor == head: # normal case: k dominated by h
|
||||
break
|
||||
else: # head not in ancestors: d -> h is non-projective
|
||||
else: # head not in ancestors: d -> h is non-projective
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def is_nonproj_tree(heads):
|
||||
# a tree is non-projective if at least one arc is non-projective
|
||||
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
|
||||
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
|
||||
|
||||
|
||||
def decompose(label):
|
||||
|
@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
|||
for raw_text, sents in gold_tuples:
|
||||
prepro_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
proj_heads,deco_labels = projectivize(heads,labels)
|
||||
proj_heads, deco_labels = projectivize(heads, labels)
|
||||
# set the label to ROOT for each root dependent
|
||||
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
|
||||
deco_labels = ['ROOT' if head == i else deco_labels[i]
|
||||
for i, head in enumerate(proj_heads)]
|
||||
# count label frequencies
|
||||
if label_freq_cutoff > 0:
|
||||
for label in deco_labels:
|
||||
if is_decorated(label):
|
||||
freqs[label] = freqs.get(label,0) + 1
|
||||
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
|
||||
freqs[label] = freqs.get(label, 0) + 1
|
||||
prepro_sents.append(
|
||||
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
|
||||
preprocessed.append((raw_text, prepro_sents))
|
||||
|
||||
if label_freq_cutoff > 0:
|
||||
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
|
||||
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
|
||||
return preprocessed
|
||||
|
||||
|
||||
def projectivize(heads, labels):
|
||||
# use the algorithm by Nivre & Nilsson 2005
|
||||
# assumes heads to be a proper tree, i.e. connected and cycle-free
|
||||
# returns a new pair (heads,labels) which encode
|
||||
# a projective and decorated tree
|
||||
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
|
||||
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||
# which encode a projective and decorated tree.
|
||||
proj_heads = copy(heads)
|
||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||
if smallest_np_arc == None: # this sentence is already projective
|
||||
if smallest_np_arc is None: # this sentence is already projective
|
||||
return proj_heads, copy(labels)
|
||||
while smallest_np_arc != None:
|
||||
while smallest_np_arc is not None:
|
||||
_lift(smallest_np_arc, proj_heads)
|
||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||
deco_labels = _decorate(heads, proj_heads, labels)
|
||||
|
@ -114,24 +112,26 @@ def projectivize(heads, labels):
|
|||
|
||||
|
||||
def deprojectivize(tokens):
|
||||
# reattach arcs with decorated labels (following HEAD scheme)
|
||||
# for each decorated arc X||Y, search top-down, left-to-right,
|
||||
# breadth-first until hitting a Y then make this the new head
|
||||
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||
# hitting a Y then make this the new head.
|
||||
for token in tokens:
|
||||
if is_decorated(token.dep_):
|
||||
newlabel,headlabel = decompose(token.dep_)
|
||||
newhead = _find_new_head(token,headlabel)
|
||||
newlabel, headlabel = decompose(token.dep_)
|
||||
newhead = _find_new_head(token, headlabel)
|
||||
token.head = newhead
|
||||
token.dep_ = newlabel
|
||||
return tokens
|
||||
|
||||
|
||||
def _decorate(heads, proj_heads, labels):
|
||||
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
||||
assert(len(heads) == len(proj_heads) == len(labels))
|
||||
deco_labels = []
|
||||
for tokenid,head in enumerate(heads):
|
||||
for tokenid, head in enumerate(heads):
|
||||
if head != proj_heads[tokenid]:
|
||||
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||
deco_labels.append(
|
||||
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||
else:
|
||||
deco_labels.append(labels[tokenid])
|
||||
return deco_labels
|
||||
|
@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
|
|||
# and ties are broken left to right
|
||||
smallest_size = float('inf')
|
||||
smallest_np_arc = None
|
||||
for tokenid,head in enumerate(heads):
|
||||
for tokenid, head in enumerate(heads):
|
||||
size = abs(tokenid-head)
|
||||
if size < smallest_size and is_nonproj_arc(tokenid,heads):
|
||||
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
||||
smallest_size = size
|
||||
smallest_np_arc = tokenid
|
||||
return smallest_np_arc
|
||||
|
@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
|
|||
next_queue = []
|
||||
for qtoken in queue:
|
||||
for child in qtoken.children:
|
||||
if child.is_space: continue
|
||||
if child == token: continue
|
||||
if child.is_space:
|
||||
continue
|
||||
if child == token:
|
||||
continue
|
||||
if child.dep_ == headlabel:
|
||||
return child
|
||||
next_queue.append(child)
|
||||
|
@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
|
|||
for raw_text, sents in gold_tuples:
|
||||
filtered_sents = []
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
|
||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
||||
filtered_labels = [decompose(label)[0]
|
||||
if freqs.get(label, cutoff) < cutoff
|
||||
else label for label in labels]
|
||||
filtered_sents.append(
|
||||
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
||||
filtered.append((raw_text, filtered_sents))
|
||||
return filtered
|
||||
|
|
|
@ -2,17 +2,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from libc.string cimport memcpy, memset
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
import numpy
|
||||
|
||||
from ..vocab cimport EMPTY_LEXEME
|
||||
from ..structs cimport Entity
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..symbols cimport punct
|
||||
from ..attrs cimport IS_SPACE
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..tokens.token cimport Token
|
||||
from ..tokens.doc cimport Doc
|
||||
|
||||
|
||||
|
|
|
@ -2,17 +2,17 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
||||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
from collections import defaultdict, OrderedDict
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
|
||||
from .. import util
|
||||
from ..structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
||||
from ..typedefs cimport attr_t
|
||||
from ..compat import json_dumps
|
||||
from .. import util
|
||||
|
||||
|
||||
cdef weight_t MIN_SCORE = -90000
|
||||
|
@ -136,11 +136,12 @@ cdef class TransitionSystem:
|
|||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
||||
print("Self labels",
|
||||
[self.c[i].label for i in range(self.n_moves)])
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise "
|
||||
"the entity recognizer\n"
|
||||
"The transition system has %d actions." % (self.n_moves))
|
||||
"the entity recognizer. The transition system has "
|
||||
"%d actions." % (self.n_moves))
|
||||
|
||||
def get_class_name(self, int clas):
|
||||
act = self.c[clas]
|
||||
|
@ -149,7 +150,7 @@ cdef class TransitionSystem:
|
|||
def add_action(self, int action, label_name):
|
||||
cdef attr_t label_id
|
||||
if not isinstance(label_name, int) and \
|
||||
not isinstance(label_name, long):
|
||||
not isinstance(label_name, long):
|
||||
label_id = self.strings.add(label_name)
|
||||
else:
|
||||
label_id = label_name
|
||||
|
@ -186,7 +187,7 @@ cdef class TransitionSystem:
|
|||
'name': self.move_name(trans.move, trans.label)
|
||||
})
|
||||
serializers = {
|
||||
'transitions': lambda: ujson.dumps(transitions),
|
||||
'transitions': lambda: json_dumps(transitions),
|
||||
'strings': lambda: self.strings.to_bytes()
|
||||
}
|
||||
return util.to_bytes(serializers, exclude)
|
||||
|
|
Loading…
Reference in New Issue
Block a user