mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Tidy up syntax
This commit is contained in:
parent
5167a0cce2
commit
b4d226a3f1
|
@ -2,7 +2,7 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
import numpy
|
import numpy
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from thinc.extra.search import MaxViolation
|
from thinc.extra.search import MaxViolation
|
||||||
from thinc.typedefs cimport hash_t, class_t
|
from thinc.typedefs cimport hash_t, class_t
|
||||||
|
@ -11,7 +11,6 @@ from thinc.extra.search cimport MaxViolation
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
from .transition_system cimport TransitionSystem, Transition
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
|
|
||||||
|
|
||||||
# These are passed as callbacks to thinc.search.Beam
|
# These are passed as callbacks to thinc.search.Beam
|
||||||
|
@ -50,7 +49,7 @@ cdef class ParserBeam(object):
|
||||||
cdef public object dones
|
cdef public object dones
|
||||||
|
|
||||||
def __init__(self, TransitionSystem moves, states, golds,
|
def __init__(self, TransitionSystem moves, states, golds,
|
||||||
int width, float density):
|
int width, float density):
|
||||||
self.moves = moves
|
self.moves = moves
|
||||||
self.states = states
|
self.states = states
|
||||||
self.golds = golds
|
self.golds = golds
|
||||||
|
@ -59,7 +58,8 @@ cdef class ParserBeam(object):
|
||||||
cdef StateClass state, st
|
cdef StateClass state, st
|
||||||
for state in states:
|
for state in states:
|
||||||
beam = Beam(self.moves.n_moves, width, density)
|
beam = Beam(self.moves.n_moves, width, density)
|
||||||
beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent)
|
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||||
|
state.c._sent)
|
||||||
for i in range(beam.width):
|
for i in range(beam.width):
|
||||||
st = <StateClass>beam.at(i)
|
st = <StateClass>beam.at(i)
|
||||||
st.c.offset = state.c.offset
|
st.c.offset = state.c.offset
|
||||||
|
@ -74,7 +74,8 @@ cdef class ParserBeam(object):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_done(self):
|
def is_done(self):
|
||||||
return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams))
|
return all(b.is_done or self.dones[i]
|
||||||
|
for i, b in enumerate(self.beams))
|
||||||
|
|
||||||
def __getitem__(self, i):
|
def __getitem__(self, i):
|
||||||
return self.beams[i]
|
return self.beams[i]
|
||||||
|
@ -126,7 +127,8 @@ cdef class ParserBeam(object):
|
||||||
for i in range(beam.size):
|
for i in range(beam.size):
|
||||||
state = <StateClass>beam.at(i)
|
state = <StateClass>beam.at(i)
|
||||||
if not state.c.is_final():
|
if not state.c.is_final():
|
||||||
self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold)
|
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||||
|
state, gold)
|
||||||
if follow_gold:
|
if follow_gold:
|
||||||
for j in range(beam.nr_class):
|
for j in range(beam.nr_class):
|
||||||
if beam.costs[i][j] >= 1:
|
if beam.costs[i][j] >= 1:
|
||||||
|
@ -146,7 +148,10 @@ def get_token_ids(states, int n_tokens):
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
|
|
||||||
nr_update = 0
|
nr_update = 0
|
||||||
|
|
||||||
|
|
||||||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
states, golds,
|
states, golds,
|
||||||
state2vec, vec2scores,
|
state2vec, vec2scores,
|
||||||
|
@ -167,23 +172,27 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
if pbeam.is_done and gbeam.is_done:
|
if pbeam.is_done and gbeam.is_done:
|
||||||
break
|
break
|
||||||
# The beam maps let us find the right row in the flattened scores
|
# The beam maps let us find the right row in the flattened scores
|
||||||
# arrays for each state. States are identified by (example id, history).
|
# arrays for each state. States are identified by (example id,
|
||||||
# We keep a different beam map for each step (since we'll have a flat
|
# history). We keep a different beam map for each step (since we'll
|
||||||
# scores array for each step). The beam map will let us take the per-state
|
# have a flat scores array for each step). The beam map will let us
|
||||||
# losses, and compute the gradient for each (step, state, class).
|
# take the per-state losses, and compute the gradient for each (step,
|
||||||
|
# state, class).
|
||||||
beam_maps.append({})
|
beam_maps.append({})
|
||||||
# Gather all states from the two beams in a list. Some states may occur
|
# Gather all states from the two beams in a list. Some states may occur
|
||||||
# in both beams. To figure out which beam each state belonged to,
|
# in both beams. To figure out which beam each state belonged to,
|
||||||
# we keep two lists of indices, p_indices and g_indices
|
# we keep two lists of indices, p_indices and g_indices
|
||||||
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update)
|
states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1],
|
||||||
|
nr_update)
|
||||||
if not states:
|
if not states:
|
||||||
break
|
break
|
||||||
# Now that we have our flat list of states, feed them through the model
|
# Now that we have our flat list of states, feed them through the model
|
||||||
token_ids = get_token_ids(states, nr_feature)
|
token_ids = get_token_ids(states, nr_feature)
|
||||||
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||||
if hist_feats:
|
if hist_feats:
|
||||||
hists = numpy.asarray([st.history[:hist_feats] for st in states], dtype='i')
|
hists = numpy.asarray([st.history[:hist_feats] for st in states],
|
||||||
scores, bp_scores = vec2scores.begin_update((vectors, hists), drop=drop)
|
dtype='i')
|
||||||
|
scores, bp_scores = vec2scores.begin_update((vectors, hists),
|
||||||
|
drop=drop)
|
||||||
else:
|
else:
|
||||||
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||||
|
|
||||||
|
@ -192,8 +201,10 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||||
|
|
||||||
# Unpack the flat scores into lists for the two beams. The indices arrays
|
# Unpack the flat scores into lists for the two beams. The indices arrays
|
||||||
# tell us which example and state the scores-row refers to.
|
# tell us which example and state the scores-row refers to.
|
||||||
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices]
|
p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices]
|
for indices in p_indices]
|
||||||
|
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||||
|
for indices in g_indices]
|
||||||
# Now advance the states in the beams. The gold beam is constrained
|
# Now advance the states in the beams. The gold beam is constrained
|
||||||
# to follow only gold analyses.
|
# to follow only gold analyses.
|
||||||
pbeam.advance(p_scores)
|
pbeam.advance(p_scores)
|
||||||
|
@ -249,8 +260,7 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
||||||
|
|
||||||
|
|
||||||
def get_gradient(nr_class, beam_maps, histories, losses):
|
def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
"""
|
"""The global model assigns a loss to each parse. The beam scores
|
||||||
The global model assigns a loss to each parse. The beam scores
|
|
||||||
are additive, so the same gradient is applied to each action
|
are additive, so the same gradient is applied to each action
|
||||||
in the history. This gives the gradient of a single *action*
|
in the history. This gives the gradient of a single *action*
|
||||||
for a beam state -- so we have "the gradient of loss for taking
|
for a beam state -- so we have "the gradient of loss for taking
|
||||||
|
@ -270,7 +280,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
if loss != 0.0 and not numpy.isnan(loss):
|
if loss != 0.0 and not numpy.isnan(loss):
|
||||||
nr_step = max(nr_step, len(hist))
|
nr_step = max(nr_step, len(hist))
|
||||||
for i in range(nr_step):
|
for i in range(nr_step):
|
||||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f'))
|
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
|
||||||
|
dtype='f'))
|
||||||
assert len(histories) == len(losses)
|
assert len(histories) == len(losses)
|
||||||
for eg_id, hists in enumerate(histories):
|
for eg_id, hists in enumerate(histories):
|
||||||
for loss, hist in zip(losses[eg_id], hists):
|
for loss, hist in zip(losses[eg_id], hists):
|
||||||
|
@ -287,5 +298,3 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
||||||
grads[j][i, clas] += loss
|
grads[j][i, clas] += loss
|
||||||
key = key + tuple([clas])
|
key = key + tuple([clas])
|
||||||
return grads
|
return grads
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
# test
|
|
|
@ -4,24 +4,16 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport Py_INCREF
|
||||||
import ctypes
|
|
||||||
from libc.stdint cimport uint32_t
|
|
||||||
from libc.string cimport memcpy
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
import numpy
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC, is_space_token
|
from ._state cimport StateC
|
||||||
from .nonproj import is_nonproj_tree
|
from .nonproj import is_nonproj_tree
|
||||||
from .transition_system cimport do_func_t, get_cost_func_t
|
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse, GoldParseC
|
||||||
from ..gold cimport GoldParseC
|
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE, IS_PUNCT
|
|
||||||
from ..lexeme cimport Lexeme
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
|
|
||||||
|
|
||||||
|
@ -316,14 +308,13 @@ cdef class ArcEager(TransitionSystem):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = kwargs.get('actions',
|
actions = kwargs.get('actions', OrderedDict((
|
||||||
OrderedDict((
|
(SHIFT, ['']),
|
||||||
(SHIFT, ['']),
|
(REDUCE, ['']),
|
||||||
(REDUCE, ['']),
|
(RIGHT, []),
|
||||||
(RIGHT, []),
|
(LEFT, []),
|
||||||
(LEFT, []),
|
(BREAK, ['ROOT']))
|
||||||
(BREAK, ['ROOT'])
|
))
|
||||||
)))
|
|
||||||
seen_actions = set()
|
seen_actions = set()
|
||||||
for label in kwargs.get('left_labels', []):
|
for label in kwargs.get('left_labels', []):
|
||||||
if label.upper() != 'ROOT':
|
if label.upper() != 'ROOT':
|
||||||
|
@ -363,7 +354,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if gold.cand_to_gold[i] is None:
|
if gold.cand_to_gold[i] is None:
|
||||||
continue
|
continue
|
||||||
if state.safe_get(i).dep:
|
if state.safe_get(i).dep:
|
||||||
predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep]))
|
predicted.add((i, state.H(i),
|
||||||
|
self.strings[state.safe_get(i).dep]))
|
||||||
else:
|
else:
|
||||||
predicted.add((i, state.H(i), 'ROOT'))
|
predicted.add((i, state.H(i), 'ROOT'))
|
||||||
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
|
||||||
|
@ -381,7 +373,8 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if not self.has_gold(gold):
|
if not self.has_gold(gold):
|
||||||
return None
|
return None
|
||||||
for i in range(gold.length):
|
for i in range(gold.length):
|
||||||
if gold.heads[i] is None or gold.labels[i] is None: # Missing values
|
# Missing values
|
||||||
|
if gold.heads[i] is None or gold.labels[i] is None:
|
||||||
gold.c.heads[i] = i
|
gold.c.heads[i] = i
|
||||||
gold.c.has_dep[i] = False
|
gold.c.has_dep[i] = False
|
||||||
else:
|
else:
|
||||||
|
@ -517,14 +510,15 @@ cdef class ArcEager(TransitionSystem):
|
||||||
# Check projectivity --- leading cause
|
# Check projectivity --- leading cause
|
||||||
if is_nonproj_tree(gold.heads):
|
if is_nonproj_tree(gold.heads):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the dependency "
|
"Could not find a gold-standard action to supervise the "
|
||||||
"parser.\n"
|
"dependency parser. Likely cause: the tree is "
|
||||||
"Likely cause: the tree is non-projective (i.e. it has crossing "
|
"non-projective (i.e. it has crossing arcs -- see "
|
||||||
"arcs -- see spacy/syntax/nonproj.pyx for definitions)\n"
|
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
|
||||||
"The ArcEager transition system only supports projective trees.\n"
|
"transition system only supports projective trees. To "
|
||||||
"To learn non-projective representations, transform the data "
|
"learn non-projective representations, transform the data "
|
||||||
"before training and after parsing. Either pass make_projective=True "
|
"before training and after parsing. Either pass "
|
||||||
"to the GoldParse class, or use PseudoProjectivity.preprocess_training_data")
|
"make_projective=True to the GoldParse class, or use "
|
||||||
|
"spacy.syntax.nonproj.preprocess_training_data.")
|
||||||
else:
|
else:
|
||||||
print(gold.orig_annot)
|
print(gold.orig_annot)
|
||||||
print(gold.words)
|
print(gold.words)
|
||||||
|
@ -532,12 +526,10 @@ cdef class ArcEager(TransitionSystem):
|
||||||
print(gold.labels)
|
print(gold.labels)
|
||||||
print(gold.sent_starts)
|
print(gold.sent_starts)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise the dependency "
|
"Could not find a gold-standard action to supervise the"
|
||||||
"parser.\n"
|
"dependency parser. The GoldParse was projective. The "
|
||||||
"The GoldParse was projective.\n"
|
"transition system has %d actions. State at failure: %s"
|
||||||
"The transition system has %d actions.\n"
|
% (self.n_moves, stcls.print_state(gold.words)))
|
||||||
"State at failure:\n"
|
|
||||||
"%s" % (self.n_moves, stcls.print_state(gold.words)))
|
|
||||||
assert n_gold >= 1
|
assert n_gold >= 1
|
||||||
|
|
||||||
def get_beam_annot(self, Beam beam):
|
def get_beam_annot(self, Beam beam):
|
||||||
|
@ -558,4 +550,3 @@ cdef class ArcEager(TransitionSystem):
|
||||||
deps[j].setdefault(dep, 0.0)
|
deps[j].setdefault(dep, 0.0)
|
||||||
deps[j][dep] += prob
|
deps[j][dep] += prob
|
||||||
return heads, deps
|
return heads, deps
|
||||||
|
|
||||||
|
|
|
@ -4,17 +4,12 @@ from __future__ import unicode_literals
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
import numpy
|
|
||||||
from thinc.neural.ops import NumpyOps
|
|
||||||
|
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
from .transition_system cimport Transition
|
from .transition_system cimport Transition
|
||||||
from .transition_system cimport do_func_t
|
from .transition_system cimport do_func_t
|
||||||
from ..structs cimport TokenC, Entity
|
from ..gold cimport GoldParseC, GoldParse
|
||||||
from ..gold cimport GoldParseC
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
from ..attrs cimport ENT_TYPE, ENT_IOB
|
|
||||||
|
|
||||||
|
|
||||||
cdef enum:
|
cdef enum:
|
||||||
|
@ -69,15 +64,14 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_actions(cls, **kwargs):
|
def get_actions(cls, **kwargs):
|
||||||
actions = kwargs.get('actions',
|
actions = kwargs.get('actions', OrderedDict((
|
||||||
OrderedDict((
|
(MISSING, ['']),
|
||||||
(MISSING, ['']),
|
(BEGIN, []),
|
||||||
(BEGIN, []),
|
(IN, []),
|
||||||
(IN, []),
|
(LAST, []),
|
||||||
(LAST, []),
|
(UNIT, []),
|
||||||
(UNIT, []),
|
(OUT, [''])
|
||||||
(OUT, [''])
|
)))
|
||||||
)))
|
|
||||||
seen_entities = set()
|
seen_entities = set()
|
||||||
for entity_type in kwargs.get('entity_types', []):
|
for entity_type in kwargs.get('entity_types', []):
|
||||||
if entity_type in seen_entities:
|
if entity_type in seen_entities:
|
||||||
|
@ -160,7 +154,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
cdef attr_t label
|
cdef attr_t label
|
||||||
if name == '-' or name == None:
|
if name == '-' or name is None:
|
||||||
return Transition(clas=0, move=MISSING, label=0, score=0)
|
return Transition(clas=0, move=MISSING, label=0, score=0)
|
||||||
elif name == '!O':
|
elif name == '!O':
|
||||||
return Transition(clas=0, move=ISNT, label=0, score=0)
|
return Transition(clas=0, move=ISNT, label=0, score=0)
|
||||||
|
@ -328,8 +322,8 @@ cdef class In:
|
||||||
return False
|
return False
|
||||||
elif preset_ent_iob == 3:
|
elif preset_ent_iob == 3:
|
||||||
return False
|
return False
|
||||||
# TODO: Is this quite right?
|
# TODO: Is this quite right? I think it's supposed to be ensuring the
|
||||||
# I think it's supposed to be ensuring the gazetteer matches are maintained
|
# gazetteer matches are maintained
|
||||||
elif st.B_(1).ent_iob != preset_ent_iob:
|
elif st.B_(1).ent_iob != preset_ent_iob:
|
||||||
return False
|
return False
|
||||||
# Don't allow entities to extend across sentence boundaries
|
# Don't allow entities to extend across sentence boundaries
|
||||||
|
@ -354,10 +348,12 @@ cdef class In:
|
||||||
if g_act == MISSING:
|
if g_act == MISSING:
|
||||||
return 0
|
return 0
|
||||||
elif g_act == BEGIN:
|
elif g_act == BEGIN:
|
||||||
# I, Gold B --> True (P of bad open entity sunk, R of this entity sunk)
|
# I, Gold B --> True
|
||||||
|
# (P of bad open entity sunk, R of this entity sunk)
|
||||||
return 0
|
return 0
|
||||||
elif g_act == IN:
|
elif g_act == IN:
|
||||||
# I, Gold I --> True (label forced by prev, if mismatch, P and R both sunk)
|
# I, Gold I --> True
|
||||||
|
# (label forced by prev, if mismatch, P and R both sunk)
|
||||||
return 0
|
return 0
|
||||||
elif g_act == LAST:
|
elif g_act == LAST:
|
||||||
# I, Gold L --> True iff this entity sunk and next tag == O
|
# I, Gold L --> True iff this entity sunk and next tag == O
|
||||||
|
@ -505,11 +501,3 @@ cdef class Out:
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
class OracleError(Exception):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class UnknownMove(Exception):
|
|
||||||
pass
|
|
||||||
|
|
|
@ -5,71 +5,48 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals, print_function
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
from collections import Counter, OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
import json
|
import json
|
||||||
import contextlib
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from libc.math cimport exp
|
|
||||||
cimport cython
|
|
||||||
cimport cython.parallel
|
cimport cython.parallel
|
||||||
import cytoolz
|
import cytoolz
|
||||||
import dill
|
|
||||||
|
|
||||||
import numpy.random
|
import numpy.random
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
from cpython.ref cimport PyObject, Py_XDECREF
|
||||||
from libcpp.vector cimport vector
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
|
||||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
from libc.math cimport exp
|
||||||
from libc.string cimport memset, memcpy
|
from libcpp.vector cimport vector
|
||||||
from libc.stdlib cimport malloc, calloc, free
|
from libc.string cimport memset
|
||||||
from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
|
from libc.stdlib cimport calloc, free
|
||||||
from thinc.linear.avgtron cimport AveragedPerceptron
|
from cymem.cymem cimport Pool
|
||||||
from thinc.linalg cimport Vec, VecVec
|
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||||
from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
|
|
||||||
from thinc.extra.eg cimport Example
|
|
||||||
from thinc.extra.search cimport Beam
|
from thinc.extra.search cimport Beam
|
||||||
|
from thinc.api import chain, clone
|
||||||
from cymem.cymem cimport Pool, Address
|
from thinc.v2v import Model, Maxout, Affine
|
||||||
from murmurhash.mrmr cimport hash64
|
|
||||||
from preshed.maps cimport MapStruct
|
|
||||||
from preshed.maps cimport map_get
|
|
||||||
|
|
||||||
from thinc.api import layerize, chain, clone, with_flatten
|
|
||||||
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu, SELU
|
|
||||||
from thinc.misc import LayerNorm
|
from thinc.misc import LayerNorm
|
||||||
|
from thinc.neural.ops import CupyOps
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module
|
||||||
|
|
||||||
from .. import util
|
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
|
||||||
from ..util import get_async, get_cuda_stream
|
|
||||||
from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
|
|
||||||
from .._ml import Tok2Vec, doc2feats, rebatch
|
|
||||||
from .._ml import Residual, flatten
|
|
||||||
from .._ml import link_vectors_to_models
|
from .._ml import link_vectors_to_models
|
||||||
from ..compat import json_dumps, copy_array
|
from ..compat import json_dumps, copy_array
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
from ..gold cimport GoldParse
|
||||||
|
from .. import util
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ._state cimport StateC
|
from ._state cimport StateC
|
||||||
from . import nonproj
|
from .transition_system cimport Transition
|
||||||
from .transition_system import OracleError
|
from . import _beam_utils, nonproj
|
||||||
from .transition_system cimport TransitionSystem, Transition
|
|
||||||
from ..structs cimport TokenC
|
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
from ..strings cimport StringStore
|
|
||||||
from ..gold cimport GoldParse
|
|
||||||
from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG
|
|
||||||
from . import _beam_utils
|
|
||||||
|
|
||||||
|
|
||||||
def get_templates(*args, **kwargs):
|
def get_templates(*args, **kwargs):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
|
|
||||||
|
|
||||||
def set_debug(val):
|
def set_debug(val):
|
||||||
global DEBUG
|
global DEBUG
|
||||||
DEBUG = val
|
DEBUG = val
|
||||||
|
@ -100,7 +77,8 @@ cdef class precompute_hiddens:
|
||||||
cdef object _cuda_stream
|
cdef object _cuda_stream
|
||||||
cdef object _bp_hiddens
|
cdef object _bp_hiddens
|
||||||
|
|
||||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None, drop=0.):
|
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||||
|
drop=0.):
|
||||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||||
cdef np.ndarray cached
|
cdef np.ndarray cached
|
||||||
if not isinstance(gpu_cached, numpy.ndarray):
|
if not isinstance(gpu_cached, numpy.ndarray):
|
||||||
|
@ -120,8 +98,7 @@ cdef class precompute_hiddens:
|
||||||
self._bp_hiddens = bp_features
|
self._bp_hiddens = bp_features
|
||||||
|
|
||||||
cdef const float* get_feat_weights(self) except NULL:
|
cdef const float* get_feat_weights(self) except NULL:
|
||||||
if not self._is_synchronized \
|
if not self._is_synchronized and self._cuda_stream is not None:
|
||||||
and self._cuda_stream is not None:
|
|
||||||
self._cuda_stream.synchronize()
|
self._cuda_stream.synchronize()
|
||||||
self._is_synchronized = True
|
self._is_synchronized = True
|
||||||
return <float*>self._cached.data
|
return <float*>self._cached.data
|
||||||
|
@ -130,7 +107,8 @@ cdef class precompute_hiddens:
|
||||||
return self.begin_update(X)[0]
|
return self.begin_update(X)[0]
|
||||||
|
|
||||||
def begin_update(self, token_ids, drop=0.):
|
def begin_update(self, token_ids, drop=0.):
|
||||||
cdef np.ndarray state_vector = numpy.zeros((token_ids.shape[0], self.nO*self.nP), dtype='f')
|
cdef np.ndarray state_vector = numpy.zeros(
|
||||||
|
(token_ids.shape[0], self.nO*self.nP), dtype='f')
|
||||||
# This is tricky, but (assuming GPU available);
|
# This is tricky, but (assuming GPU available);
|
||||||
# - Input to forward on CPU
|
# - Input to forward on CPU
|
||||||
# - Output from forward on CPU
|
# - Output from forward on CPU
|
||||||
|
@ -141,8 +119,8 @@ cdef class precompute_hiddens:
|
||||||
feat_weights = self.get_feat_weights()
|
feat_weights = self.get_feat_weights()
|
||||||
cdef int[:, ::1] ids = token_ids
|
cdef int[:, ::1] ids = token_ids
|
||||||
sum_state_features(<float*>state_vector.data,
|
sum_state_features(<float*>state_vector.data,
|
||||||
feat_weights, &ids[0,0],
|
feat_weights, &ids[0, 0],
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||||
|
|
||||||
def backward(d_state_vector, sgd=None):
|
def backward(d_state_vector, sgd=None):
|
||||||
|
@ -161,10 +139,11 @@ cdef class precompute_hiddens:
|
||||||
state_vector = state_vector.reshape(
|
state_vector = state_vector.reshape(
|
||||||
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
|
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
|
||||||
best, which = self.ops.maxout(state_vector)
|
best, which = self.ops.maxout(state_vector)
|
||||||
|
|
||||||
def backprop(d_best, sgd=None):
|
def backprop(d_best, sgd=None):
|
||||||
return self.ops.backprop_maxout(d_best, which, self.nP)
|
return self.ops.backprop_maxout(d_best, which, self.nP)
|
||||||
return best, backprop
|
|
||||||
|
|
||||||
|
return best, backprop
|
||||||
|
|
||||||
|
|
||||||
cdef void sum_state_features(float* output,
|
cdef void sum_state_features(float* output,
|
||||||
|
@ -239,11 +218,15 @@ cdef class Parser:
|
||||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||||
if depth != 1:
|
if depth != 1:
|
||||||
raise ValueError("Currently parser depth is hard-coded to 1.")
|
raise ValueError("Currently parser depth is hard-coded to 1.")
|
||||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces', cfg.get('maxout_pieces', 2))
|
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||||
|
cfg.get('maxout_pieces', 2))
|
||||||
if parser_maxout_pieces != 2:
|
if parser_maxout_pieces != 2:
|
||||||
raise ValueError("Currently parser_maxout_pieces is hard-coded to 2")
|
raise ValueError("Currently parser_maxout_pieces is hard-coded "
|
||||||
token_vector_width = util.env_opt('token_vector_width', cfg.get('token_vector_width', 128))
|
"to 2")
|
||||||
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
|
token_vector_width = util.env_opt('token_vector_width',
|
||||||
|
cfg.get('token_vector_width', 128))
|
||||||
|
hidden_width = util.env_opt('hidden_width',
|
||||||
|
cfg.get('hidden_width', 200))
|
||||||
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
|
||||||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
||||||
|
@ -365,8 +348,8 @@ cdef class Parser:
|
||||||
parse_states = self.parse_batch(subbatch)
|
parse_states = self.parse_batch(subbatch)
|
||||||
beams = []
|
beams = []
|
||||||
else:
|
else:
|
||||||
beams = self.beam_parse(subbatch,
|
beams = self.beam_parse(subbatch, beam_width=beam_width,
|
||||||
beam_width=beam_width, beam_density=beam_density)
|
beam_density=beam_density)
|
||||||
parse_states = []
|
parse_states = []
|
||||||
for beam in beams:
|
for beam in beams:
|
||||||
parse_states.append(<StateClass>beam.at(0))
|
parse_states.append(<StateClass>beam.at(0))
|
||||||
|
@ -386,9 +369,9 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
0.0)
|
docs, cuda_stream, 0.0)
|
||||||
nr_state = len(docs)
|
nr_state = len(docs)
|
||||||
nr_class = self.moves.n_moves
|
nr_class = self.moves.n_moves
|
||||||
nr_dim = tokvecs.shape[1]
|
nr_dim = tokvecs.shape[1]
|
||||||
|
@ -402,7 +385,8 @@ cdef class Parser:
|
||||||
|
|
||||||
feat_weights = state2vec.get_feat_weights()
|
feat_weights = state2vec.get_feat_weights()
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(vec2scores._layers[-1].W.T)
|
cdef np.ndarray hidden_weights = numpy.ascontiguousarray(
|
||||||
|
vec2scores._layers[-1].W.T)
|
||||||
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
|
cdef np.ndarray hidden_bias = vec2scores._layers[-1].b
|
||||||
|
|
||||||
hW = <float*>hidden_weights.data
|
hW = <float*>hidden_weights.data
|
||||||
|
@ -462,9 +446,9 @@ cdef class Parser:
|
||||||
cdef Doc doc
|
cdef Doc doc
|
||||||
cdef int nr_class = self.moves.n_moves
|
cdef int nr_class = self.moves.n_moves
|
||||||
cdef StateClass stcls, output
|
cdef StateClass stcls, output
|
||||||
cuda_stream = get_cuda_stream()
|
cuda_stream = util.get_cuda_stream()
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
0.0)
|
docs, cuda_stream, 0.0)
|
||||||
beams = []
|
beams = []
|
||||||
cdef int offset = 0
|
cdef int offset = 0
|
||||||
cdef int j = 0
|
cdef int j = 0
|
||||||
|
@ -519,9 +503,7 @@ cdef class Parser:
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
cuda_stream = util.get_cuda_stream()
|
||||||
cuda_stream = get_cuda_stream()
|
|
||||||
|
|
||||||
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
states, golds, max_steps = self._init_gold_batch(docs, golds)
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
|
||||||
drop)
|
drop)
|
||||||
|
@ -536,7 +518,6 @@ cdef class Parser:
|
||||||
n_steps = 0
|
n_steps = 0
|
||||||
while todo:
|
while todo:
|
||||||
states, golds = zip(*todo)
|
states, golds = zip(*todo)
|
||||||
|
|
||||||
token_ids = self.get_token_ids(states)
|
token_ids = self.get_token_ids(states)
|
||||||
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
vector, bp_vector = state2vec.begin_update(token_ids, drop=0.0)
|
||||||
if drop != 0:
|
if drop != 0:
|
||||||
|
@ -558,8 +539,8 @@ cdef class Parser:
|
||||||
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
and not isinstance(token_ids, state2vec.ops.xp.ndarray):
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
# Move token_ids and d_vector to GPU, asynchronously
|
||||||
backprops.append((
|
backprops.append((
|
||||||
get_async(cuda_stream, token_ids),
|
util.get_async(cuda_stream, token_ids),
|
||||||
get_async(cuda_stream, d_vector),
|
util.get_async(cuda_stream, d_vector),
|
||||||
bp_vector
|
bp_vector
|
||||||
))
|
))
|
||||||
else:
|
else:
|
||||||
|
@ -592,15 +573,13 @@ cdef class Parser:
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
self.moves.preprocess_gold(gold)
|
self.moves.preprocess_gold(gold)
|
||||||
|
cuda_stream = util.get_cuda_stream()
|
||||||
cuda_stream = get_cuda_stream()
|
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(
|
||||||
(tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, drop)
|
docs, cuda_stream, drop)
|
||||||
|
states_d_scores, backprops = _beam_utils.update_beam(
|
||||||
states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
|
self.moves, self.nr_feature, 500, states, golds, state2vec,
|
||||||
states, golds,
|
vec2scores, width, density, self.cfg.get('hist_size', 0),
|
||||||
state2vec, vec2scores,
|
drop=drop, losses=losses)
|
||||||
width, density, self.cfg.get('hist_size', 0),
|
|
||||||
drop=drop, losses=losses)
|
|
||||||
backprop_lower = []
|
backprop_lower = []
|
||||||
cdef float batch_size = len(docs)
|
cdef float batch_size = len(docs)
|
||||||
for i, d_scores in enumerate(states_d_scores):
|
for i, d_scores in enumerate(states_d_scores):
|
||||||
|
@ -612,13 +591,14 @@ cdef class Parser:
|
||||||
if isinstance(self.model[0].ops, CupyOps) \
|
if isinstance(self.model[0].ops, CupyOps) \
|
||||||
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
and not isinstance(ids, state2vec.ops.xp.ndarray):
|
||||||
backprop_lower.append((
|
backprop_lower.append((
|
||||||
get_async(cuda_stream, ids),
|
util.get_async(cuda_stream, ids),
|
||||||
get_async(cuda_stream, d_vector),
|
util.get_async(cuda_stream, d_vector),
|
||||||
bp_vectors))
|
bp_vectors))
|
||||||
else:
|
else:
|
||||||
backprop_lower.append((ids, d_vector, bp_vectors))
|
backprop_lower.append((ids, d_vector, bp_vectors))
|
||||||
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
|
||||||
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)
|
self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd,
|
||||||
|
cuda_stream)
|
||||||
|
|
||||||
def _init_gold_batch(self, whole_docs, whole_golds):
|
def _init_gold_batch(self, whole_docs, whole_golds):
|
||||||
"""Make a square batch, of length equal to the shortest doc. A long
|
"""Make a square batch, of length equal to the shortest doc. A long
|
||||||
|
@ -768,7 +748,8 @@ cdef class Parser:
|
||||||
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
def begin_training(self, gold_tuples, pipeline=None, **cfg):
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
gold_tuples = nonproj.preprocess_training_data(gold_tuples, label_freq_cutoff=100)
|
gold_tuples = nonproj.preprocess_training_data(gold_tuples,
|
||||||
|
label_freq_cutoff=100)
|
||||||
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
actions = self.moves.get_actions(gold_parses=gold_tuples)
|
||||||
for action, labels in actions.items():
|
for action, labels in actions.items():
|
||||||
for label in labels:
|
for label in labels:
|
||||||
|
|
|
@ -1,39 +1,37 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
"""
|
"""Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
||||||
Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005
|
|
||||||
for doing pseudo-projective parsing implementation uses the HEAD decoration
|
for doing pseudo-projective parsing implementation uses the HEAD decoration
|
||||||
scheme.
|
scheme.
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from copy import copy
|
from copy import copy
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
|
||||||
from ..attrs import DEP, HEAD
|
|
||||||
|
|
||||||
DELIMITER = '||'
|
DELIMITER = '||'
|
||||||
|
|
||||||
|
|
||||||
def ancestors(tokenid, heads):
|
def ancestors(tokenid, heads):
|
||||||
# returns all words going from the word up the path to the root
|
# Returns all words going from the word up the path to the root. The path
|
||||||
# the path to root cannot be longer than the number of words in the sentence
|
# to root cannot be longer than the number of words in the sentence. This
|
||||||
# this function ends after at most len(heads) steps
|
# function ends after at most len(heads) steps, because it would otherwise
|
||||||
# because it would otherwise loop indefinitely on cycles
|
# loop indefinitely on cycles.
|
||||||
head = tokenid
|
head = tokenid
|
||||||
cnt = 0
|
cnt = 0
|
||||||
while heads[head] != head and cnt < len(heads):
|
while heads[head] != head and cnt < len(heads):
|
||||||
head = heads[head]
|
head = heads[head]
|
||||||
cnt += 1
|
cnt += 1
|
||||||
yield head
|
yield head
|
||||||
if head == None:
|
if head is None:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
def contains_cycle(heads):
|
def contains_cycle(heads):
|
||||||
# in an acyclic tree, the path from each word following
|
# in an acyclic tree, the path from each word following the head relation
|
||||||
# the head relation upwards always ends at the root node
|
# upwards always ends at the root node
|
||||||
for tokenid in range(len(heads)):
|
for tokenid in range(len(heads)):
|
||||||
seen = set([tokenid])
|
seen = set([tokenid])
|
||||||
for ancestor in ancestors(tokenid,heads):
|
for ancestor in ancestors(tokenid, heads):
|
||||||
if ancestor in seen:
|
if ancestor in seen:
|
||||||
return seen
|
return seen
|
||||||
seen.add(ancestor)
|
seen.add(ancestor)
|
||||||
|
@ -45,26 +43,26 @@ def is_nonproj_arc(tokenid, heads):
|
||||||
# if there is a token k, h < k < d such that h is not
|
# if there is a token k, h < k < d such that h is not
|
||||||
# an ancestor of k. Same for h -> d, h > d
|
# an ancestor of k. Same for h -> d, h > d
|
||||||
head = heads[tokenid]
|
head = heads[tokenid]
|
||||||
if head == tokenid: # root arcs cannot be non-projective
|
if head == tokenid: # root arcs cannot be non-projective
|
||||||
return False
|
return False
|
||||||
elif head == None: # unattached tokens cannot be non-projective
|
elif head is None: # unattached tokens cannot be non-projective
|
||||||
return False
|
return False
|
||||||
|
|
||||||
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
|
start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
|
||||||
for k in range(start,end):
|
for k in range(start, end):
|
||||||
for ancestor in ancestors(k,heads):
|
for ancestor in ancestors(k, heads):
|
||||||
if ancestor == None: # for unattached tokens/subtrees
|
if ancestor is None: # for unattached tokens/subtrees
|
||||||
break
|
break
|
||||||
elif ancestor == head: # normal case: k dominated by h
|
elif ancestor == head: # normal case: k dominated by h
|
||||||
break
|
break
|
||||||
else: # head not in ancestors: d -> h is non-projective
|
else: # head not in ancestors: d -> h is non-projective
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def is_nonproj_tree(heads):
|
def is_nonproj_tree(heads):
|
||||||
# a tree is non-projective if at least one arc is non-projective
|
# a tree is non-projective if at least one arc is non-projective
|
||||||
return any( is_nonproj_arc(word,heads) for word in range(len(heads)) )
|
return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
|
||||||
|
|
||||||
|
|
||||||
def decompose(label):
|
def decompose(label):
|
||||||
|
@ -81,32 +79,32 @@ def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
|
||||||
for raw_text, sents in gold_tuples:
|
for raw_text, sents in gold_tuples:
|
||||||
prepro_sents = []
|
prepro_sents = []
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
proj_heads,deco_labels = projectivize(heads,labels)
|
proj_heads, deco_labels = projectivize(heads, labels)
|
||||||
# set the label to ROOT for each root dependent
|
# set the label to ROOT for each root dependent
|
||||||
deco_labels = [ 'ROOT' if head == i else deco_labels[i] for i,head in enumerate(proj_heads) ]
|
deco_labels = ['ROOT' if head == i else deco_labels[i]
|
||||||
|
for i, head in enumerate(proj_heads)]
|
||||||
# count label frequencies
|
# count label frequencies
|
||||||
if label_freq_cutoff > 0:
|
if label_freq_cutoff > 0:
|
||||||
for label in deco_labels:
|
for label in deco_labels:
|
||||||
if is_decorated(label):
|
if is_decorated(label):
|
||||||
freqs[label] = freqs.get(label,0) + 1
|
freqs[label] = freqs.get(label, 0) + 1
|
||||||
prepro_sents.append(((ids,words,tags,proj_heads,deco_labels,iob), ctnts))
|
prepro_sents.append(
|
||||||
|
((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
|
||||||
preprocessed.append((raw_text, prepro_sents))
|
preprocessed.append((raw_text, prepro_sents))
|
||||||
|
|
||||||
if label_freq_cutoff > 0:
|
if label_freq_cutoff > 0:
|
||||||
return _filter_labels(preprocessed,label_freq_cutoff,freqs)
|
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
|
||||||
return preprocessed
|
return preprocessed
|
||||||
|
|
||||||
|
|
||||||
def projectivize(heads, labels):
|
def projectivize(heads, labels):
|
||||||
# use the algorithm by Nivre & Nilsson 2005
|
# Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
|
||||||
# assumes heads to be a proper tree, i.e. connected and cycle-free
|
# tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
|
||||||
# returns a new pair (heads,labels) which encode
|
# which encode a projective and decorated tree.
|
||||||
# a projective and decorated tree
|
|
||||||
proj_heads = copy(heads)
|
proj_heads = copy(heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||||
if smallest_np_arc == None: # this sentence is already projective
|
if smallest_np_arc is None: # this sentence is already projective
|
||||||
return proj_heads, copy(labels)
|
return proj_heads, copy(labels)
|
||||||
while smallest_np_arc != None:
|
while smallest_np_arc is not None:
|
||||||
_lift(smallest_np_arc, proj_heads)
|
_lift(smallest_np_arc, proj_heads)
|
||||||
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
|
||||||
deco_labels = _decorate(heads, proj_heads, labels)
|
deco_labels = _decorate(heads, proj_heads, labels)
|
||||||
|
@ -114,24 +112,26 @@ def projectivize(heads, labels):
|
||||||
|
|
||||||
|
|
||||||
def deprojectivize(tokens):
|
def deprojectivize(tokens):
|
||||||
# reattach arcs with decorated labels (following HEAD scheme)
|
# Reattach arcs with decorated labels (following HEAD scheme). For each
|
||||||
# for each decorated arc X||Y, search top-down, left-to-right,
|
# decorated arc X||Y, search top-down, left-to-right, breadth-first until
|
||||||
# breadth-first until hitting a Y then make this the new head
|
# hitting a Y then make this the new head.
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
if is_decorated(token.dep_):
|
if is_decorated(token.dep_):
|
||||||
newlabel,headlabel = decompose(token.dep_)
|
newlabel, headlabel = decompose(token.dep_)
|
||||||
newhead = _find_new_head(token,headlabel)
|
newhead = _find_new_head(token, headlabel)
|
||||||
token.head = newhead
|
token.head = newhead
|
||||||
token.dep_ = newlabel
|
token.dep_ = newlabel
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
|
||||||
def _decorate(heads, proj_heads, labels):
|
def _decorate(heads, proj_heads, labels):
|
||||||
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
# uses decoration scheme HEAD from Nivre & Nilsson 2005
|
||||||
assert(len(heads) == len(proj_heads) == len(labels))
|
assert(len(heads) == len(proj_heads) == len(labels))
|
||||||
deco_labels = []
|
deco_labels = []
|
||||||
for tokenid,head in enumerate(heads):
|
for tokenid, head in enumerate(heads):
|
||||||
if head != proj_heads[tokenid]:
|
if head != proj_heads[tokenid]:
|
||||||
deco_labels.append('%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
deco_labels.append(
|
||||||
|
'%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
|
||||||
else:
|
else:
|
||||||
deco_labels.append(labels[tokenid])
|
deco_labels.append(labels[tokenid])
|
||||||
return deco_labels
|
return deco_labels
|
||||||
|
@ -143,9 +143,9 @@ def _get_smallest_nonproj_arc(heads):
|
||||||
# and ties are broken left to right
|
# and ties are broken left to right
|
||||||
smallest_size = float('inf')
|
smallest_size = float('inf')
|
||||||
smallest_np_arc = None
|
smallest_np_arc = None
|
||||||
for tokenid,head in enumerate(heads):
|
for tokenid, head in enumerate(heads):
|
||||||
size = abs(tokenid-head)
|
size = abs(tokenid-head)
|
||||||
if size < smallest_size and is_nonproj_arc(tokenid,heads):
|
if size < smallest_size and is_nonproj_arc(tokenid, heads):
|
||||||
smallest_size = size
|
smallest_size = size
|
||||||
smallest_np_arc = tokenid
|
smallest_np_arc = tokenid
|
||||||
return smallest_np_arc
|
return smallest_np_arc
|
||||||
|
@ -168,8 +168,10 @@ def _find_new_head(token, headlabel):
|
||||||
next_queue = []
|
next_queue = []
|
||||||
for qtoken in queue:
|
for qtoken in queue:
|
||||||
for child in qtoken.children:
|
for child in qtoken.children:
|
||||||
if child.is_space: continue
|
if child.is_space:
|
||||||
if child == token: continue
|
continue
|
||||||
|
if child == token:
|
||||||
|
continue
|
||||||
if child.dep_ == headlabel:
|
if child.dep_ == headlabel:
|
||||||
return child
|
return child
|
||||||
next_queue.append(child)
|
next_queue.append(child)
|
||||||
|
@ -184,7 +186,10 @@ def _filter_labels(gold_tuples, cutoff, freqs):
|
||||||
for raw_text, sents in gold_tuples:
|
for raw_text, sents in gold_tuples:
|
||||||
filtered_sents = []
|
filtered_sents = []
|
||||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||||
filtered_labels = [ decompose(label)[0] if freqs.get(label,cutoff) < cutoff else label for label in labels ]
|
filtered_labels = [decompose(label)[0]
|
||||||
filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
|
if freqs.get(label, cutoff) < cutoff
|
||||||
|
else label for label in labels]
|
||||||
|
filtered_sents.append(
|
||||||
|
((ids, words, tags, heads, filtered_labels, iob), ctnts))
|
||||||
filtered.append((raw_text, filtered_sents))
|
filtered.append((raw_text, filtered_sents))
|
||||||
return filtered
|
return filtered
|
||||||
|
|
|
@ -2,17 +2,8 @@
|
||||||
# cython: infer_types=True
|
# cython: infer_types=True
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from libc.string cimport memcpy, memset
|
|
||||||
from libc.stdint cimport uint32_t, uint64_t
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
|
||||||
from ..structs cimport Entity
|
|
||||||
from ..lexeme cimport Lexeme
|
|
||||||
from ..symbols cimport punct
|
|
||||||
from ..attrs cimport IS_SPACE
|
|
||||||
from ..attrs cimport attr_id_t
|
|
||||||
from ..tokens.token cimport Token
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,17 +2,17 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
|
from cpython.ref cimport Py_INCREF
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from thinc.typedefs cimport weight_t
|
from thinc.typedefs cimport weight_t
|
||||||
from collections import defaultdict, OrderedDict
|
from collections import OrderedDict
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
from .. import util
|
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC
|
||||||
from .stateclass cimport StateClass
|
from .stateclass cimport StateClass
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
|
|
||||||
from ..typedefs cimport attr_t
|
from ..typedefs cimport attr_t
|
||||||
|
from ..compat import json_dumps
|
||||||
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
cdef weight_t MIN_SCORE = -90000
|
cdef weight_t MIN_SCORE = -90000
|
||||||
|
@ -136,11 +136,12 @@ cdef class TransitionSystem:
|
||||||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||||
print("Self labels", [self.c[i].label for i in range(self.n_moves)])
|
print("Self labels",
|
||||||
|
[self.c[i].label for i in range(self.n_moves)])
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Could not find a gold-standard action to supervise "
|
"Could not find a gold-standard action to supervise "
|
||||||
"the entity recognizer\n"
|
"the entity recognizer. The transition system has "
|
||||||
"The transition system has %d actions." % (self.n_moves))
|
"%d actions." % (self.n_moves))
|
||||||
|
|
||||||
def get_class_name(self, int clas):
|
def get_class_name(self, int clas):
|
||||||
act = self.c[clas]
|
act = self.c[clas]
|
||||||
|
@ -149,7 +150,7 @@ cdef class TransitionSystem:
|
||||||
def add_action(self, int action, label_name):
|
def add_action(self, int action, label_name):
|
||||||
cdef attr_t label_id
|
cdef attr_t label_id
|
||||||
if not isinstance(label_name, int) and \
|
if not isinstance(label_name, int) and \
|
||||||
not isinstance(label_name, long):
|
not isinstance(label_name, long):
|
||||||
label_id = self.strings.add(label_name)
|
label_id = self.strings.add(label_name)
|
||||||
else:
|
else:
|
||||||
label_id = label_name
|
label_id = label_name
|
||||||
|
@ -186,7 +187,7 @@ cdef class TransitionSystem:
|
||||||
'name': self.move_name(trans.move, trans.label)
|
'name': self.move_name(trans.move, trans.label)
|
||||||
})
|
})
|
||||||
serializers = {
|
serializers = {
|
||||||
'transitions': lambda: ujson.dumps(transitions),
|
'transitions': lambda: json_dumps(transitions),
|
||||||
'strings': lambda: self.strings.to_bytes()
|
'strings': lambda: self.strings.to_bytes()
|
||||||
}
|
}
|
||||||
return util.to_bytes(serializers, exclude)
|
return util.to_bytes(serializers, exclude)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user