mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
a7aa49c419
|
@ -5,7 +5,7 @@ environment:
|
|||
# For Python versions available on Appveyor, see
|
||||
# http://www.appveyor.com/docs/installed-software#python
|
||||
|
||||
- PYTHON: "C:\\Python27"
|
||||
- PYTHON: "C:\\Python27-x64"
|
||||
#- PYTHON: "C:\\Python34"
|
||||
#- PYTHON: "C:\\Python35"
|
||||
#- PYTHON: "C:\\Python27-x64"
|
||||
|
|
|
@ -22,6 +22,7 @@ install:
|
|||
- pip install flake8
|
||||
|
||||
script:
|
||||
- "cat /proc/cpuinfo | grep flags | head -n 1"
|
||||
- "pip install pytest pytest-timeout"
|
||||
- if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
|
||||
- if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
|
||||
|
|
|
@ -3,7 +3,7 @@ pathlib
|
|||
numpy>=1.7
|
||||
cymem>=1.30,<1.32
|
||||
preshed>=1.0.0,<2.0.0
|
||||
thinc>=6.11.1.dev12,<6.12.0
|
||||
thinc>=6.11.1.dev17,<6.12.0
|
||||
murmurhash>=0.28,<0.29
|
||||
cytoolz>=0.9.0,<0.10.0
|
||||
plac<1.0.0,>=0.9.6
|
||||
|
|
5
setup.py
5
setup.py
|
@ -28,9 +28,10 @@ MOD_NAMES = [
|
|||
'spacy.pipeline',
|
||||
'spacy.syntax.stateclass',
|
||||
'spacy.syntax._state',
|
||||
'spacy.syntax._beam_utils',
|
||||
'spacy.tokenizer',
|
||||
'spacy.syntax.nn_parser',
|
||||
'spacy.syntax._parser_model',
|
||||
'spacy.syntax._beam_utils',
|
||||
'spacy.syntax.nonproj',
|
||||
'spacy.syntax.transition_system',
|
||||
'spacy.syntax.arc_eager',
|
||||
|
@ -191,7 +192,7 @@ def setup_package():
|
|||
'murmurhash>=0.28,<0.29',
|
||||
'cymem>=1.30,<1.32',
|
||||
'preshed>=1.0.0,<2.0.0',
|
||||
'thinc>=6.11.1.dev11,<6.12.0',
|
||||
'thinc>=6.11.1.dev17,<6.12.0',
|
||||
'plac<1.0.0,>=0.9.6',
|
||||
'pathlib',
|
||||
'ujson>=1.35',
|
||||
|
|
|
@ -16,10 +16,12 @@ from ..gold import GoldParse
|
|||
from ..util import compounding, minibatch_by_words
|
||||
from ..syntax.nonproj import projectivize
|
||||
from ..matcher import Matcher
|
||||
from ..morphology import Fused_begin, Fused_inside
|
||||
#from ..morphology import Fused_begin, Fused_inside
|
||||
from .. import displacy
|
||||
from collections import defaultdict, Counter
|
||||
from timeit import default_timer as timer
|
||||
Fused_begin = None
|
||||
Fused_inside = None
|
||||
|
||||
import itertools
|
||||
import random
|
||||
|
@ -254,12 +256,6 @@ def get_token_split_end(token):
|
|||
return token.nbor(i-1)
|
||||
|
||||
|
||||
Token.set_extension('split_start', getter=get_token_split_start)
|
||||
Token.set_extension('split_end', getter=get_token_split_end)
|
||||
Token.set_extension('begins_fused', default=False)
|
||||
Token.set_extension('inside_fused', default=False)
|
||||
|
||||
|
||||
##################
|
||||
# Initialization #
|
||||
##################
|
||||
|
@ -280,6 +276,10 @@ def initialize_pipeline(nlp, docs, golds, config, device):
|
|||
corpus=("UD corpus to evaluate, e.g. UD_English, UD_Spanish, etc", "positional", None, str),
|
||||
)
|
||||
def main(test_data_dir, experiment_dir, corpus):
|
||||
Token.set_extension('split_start', getter=get_token_split_start)
|
||||
Token.set_extension('split_end', getter=get_token_split_end)
|
||||
Token.set_extension('begins_fused', default=False)
|
||||
Token.set_extension('inside_fused', default=False)
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
lang.ru.Russian.Defaults.use_pymorphy2 = False
|
||||
|
|
|
@ -170,9 +170,19 @@ def golds_to_gold_tuples(docs, golds):
|
|||
##############
|
||||
|
||||
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
if text_loc.parts[-1].endswith('.conllu'):
|
||||
docs = []
|
||||
with text_loc.open() as file_:
|
||||
for conllu_doc in read_conllu(file_):
|
||||
for conllu_sent in conllu_doc:
|
||||
words = [line[1] for line in conllu_sent]
|
||||
docs.append(Doc(nlp.vocab, words=words))
|
||||
for name, component in nlp.pipeline:
|
||||
docs = list(component.pipe(docs))
|
||||
else:
|
||||
with text_loc.open('r', encoding='utf8') as text_file:
|
||||
texts = split_text(text_file.read())
|
||||
docs = list(nlp.pipe(texts))
|
||||
with sys_loc.open('w', encoding='utf8') as out_file:
|
||||
write_conllu(docs, out_file)
|
||||
with gold_loc.open('r', encoding='utf8') as gold_file:
|
||||
|
@ -270,12 +280,12 @@ def load_nlp(corpus, config, vectors=None):
|
|||
|
||||
|
||||
def initialize_pipeline(nlp, docs, golds, config, device):
|
||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
||||
nlp.add_pipe(nlp.create_pipe('parser'))
|
||||
if config.multitask_tag:
|
||||
nlp.parser.add_multitask_objective('tag')
|
||||
if config.multitask_sent:
|
||||
nlp.parser.add_multitask_objective('sent_start')
|
||||
nlp.add_pipe(nlp.create_pipe('tagger'))
|
||||
for gold in golds:
|
||||
for tag in gold.tags:
|
||||
if tag is not None:
|
||||
|
@ -337,10 +347,12 @@ class TreebankPaths(object):
|
|||
config=("Path to json formatted config file", "positional"),
|
||||
limit=("Size limit", "option", "n", int),
|
||||
use_gpu=("Use GPU", "option", "g", int),
|
||||
use_oracle_segments=("Use oracle segments", "flag", "G", int),
|
||||
vectors_dir=("Path to directory with pre-trained vectors, named e.g. en/",
|
||||
"option", "v", Path),
|
||||
)
|
||||
def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=None):
|
||||
def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=None,
|
||||
use_oracle_segments=False):
|
||||
spacy.util.fix_random_seed()
|
||||
lang.zh.Chinese.Defaults.use_jieba = False
|
||||
lang.ja.Japanese.Defaults.use_janome = False
|
||||
|
@ -353,13 +365,17 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=No
|
|||
nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
|
||||
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length, limit=limit)
|
||||
max_doc_length=None, limit=limit)
|
||||
|
||||
optimizer = initialize_pipeline(nlp, docs, golds, config, use_gpu)
|
||||
|
||||
batch_sizes = compounding(config.batch_size//10, config.batch_size, 1.001)
|
||||
nlp.parser.cfg['beam_update_prob'] = 1.0
|
||||
for i in range(config.nr_epoch):
|
||||
docs = [nlp.make_doc(doc.text) for doc in docs]
|
||||
docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),
|
||||
max_doc_length=config.max_doc_length, limit=limit,
|
||||
oracle_segments=use_oracle_segments,
|
||||
raw_text=not use_oracle_segments)
|
||||
Xs = list(zip(docs, golds))
|
||||
random.shuffle(Xs)
|
||||
batches = minibatch_by_words(Xs, size=batch_sizes)
|
||||
|
@ -374,7 +390,12 @@ def main(ud_dir, parses_dir, config, corpus, limit=0, use_gpu=-1, vectors_dir=No
|
|||
|
||||
out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)
|
||||
with nlp.use_params(optimizer.averages):
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
|
||||
if use_oracle_segments:
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
|
||||
paths.dev.conllu, out_path)
|
||||
else:
|
||||
parsed_docs, scores = evaluate(nlp, paths.dev.text,
|
||||
paths.dev.conllu, out_path)
|
||||
print_progress(i, losses, scores)
|
||||
_render_parses(i, parsed_docs[:50])
|
||||
|
||||
|
|
|
@ -160,7 +160,7 @@ class GoldCorpus(object):
|
|||
yield item
|
||||
i += len(item[1])
|
||||
if limit and i >= limit:
|
||||
break
|
||||
return
|
||||
|
||||
@property
|
||||
def dev_tuples(self):
|
||||
|
@ -178,9 +178,9 @@ class GoldCorpus(object):
|
|||
for raw_text, paragraph_tuples in self.train_tuples:
|
||||
for sent_tuples, brackets in paragraph_tuples:
|
||||
n += len(sent_tuples[1])
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
i += len(paragraph_tuples)
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
i += 1
|
||||
return n
|
||||
|
||||
def train_docs(self, nlp, gold_preproc=False, max_length=None,
|
||||
|
@ -394,7 +394,7 @@ cdef class GoldParse:
|
|||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None,
|
||||
heads=None, deps=None, entities=None, make_projective=False,
|
||||
cats=None):
|
||||
cats=None, **_):
|
||||
"""Create a GoldParse.
|
||||
|
||||
doc (Doc): The document the annotations refer to.
|
||||
|
|
6
spacy/syntax/_beam_utils.pxd
Normal file
6
spacy/syntax/_beam_utils.pxd
Normal file
|
@ -0,0 +1,6 @@
|
|||
from thinc.typedefs cimport class_t
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1
|
||||
|
||||
cdef int check_final_state(void* _state, void* extra_args) except -1
|
|
@ -15,7 +15,7 @@ from .stateclass cimport StateC, StateClass
|
|||
|
||||
|
||||
# These are passed as callbacks to thinc.search.Beam
|
||||
cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||
cdef int transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
|
||||
dest = <StateC*>_dest
|
||||
src = <StateC*>_src
|
||||
moves = <const Transition*>_moves
|
||||
|
@ -24,12 +24,12 @@ cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves)
|
|||
dest.push_hist(clas)
|
||||
|
||||
|
||||
cdef int _check_final_state(void* _state, void* extra_args) except -1:
|
||||
cdef int check_final_state(void* _state, void* extra_args) except -1:
|
||||
state = <StateC*>_state
|
||||
return state.is_final()
|
||||
|
||||
|
||||
cdef hash_t _hash_state(void* _state, void* _) except 0:
|
||||
cdef hash_t hash_state(void* _state, void* _) except 0:
|
||||
state = <StateC*>_state
|
||||
if state.is_final():
|
||||
return 1
|
||||
|
@ -37,6 +37,20 @@ cdef hash_t _hash_state(void* _state, void* _) except 0:
|
|||
return state.hash()
|
||||
|
||||
|
||||
def collect_states(beams):
|
||||
cdef StateClass state
|
||||
cdef Beam beam
|
||||
states = []
|
||||
for state_or_beam in beams:
|
||||
if isinstance(state_or_beam, StateClass):
|
||||
states.append(state_or_beam)
|
||||
else:
|
||||
beam = state_or_beam
|
||||
state = StateClass.borrow(<StateC*>beam.at(0))
|
||||
states.append(state)
|
||||
return states
|
||||
|
||||
|
||||
cdef class ParserBeam(object):
|
||||
cdef public TransitionSystem moves
|
||||
cdef public object states
|
||||
|
@ -45,7 +59,7 @@ cdef class ParserBeam(object):
|
|||
cdef public object dones
|
||||
|
||||
def __init__(self, TransitionSystem moves, states, golds,
|
||||
int width, float density):
|
||||
int width, float density=0.):
|
||||
self.moves = moves
|
||||
self.states = states
|
||||
self.golds = golds
|
||||
|
@ -54,7 +68,7 @@ cdef class ParserBeam(object):
|
|||
cdef StateClass state
|
||||
cdef StateC* st
|
||||
for state in states:
|
||||
beam = Beam(self.moves.n_moves, width, density)
|
||||
beam = Beam(self.moves.n_moves, width, min_density=density)
|
||||
beam.initialize(self.moves.init_beam_state, state.c.length,
|
||||
state.c._sent)
|
||||
for i in range(beam.width):
|
||||
|
@ -82,8 +96,8 @@ cdef class ParserBeam(object):
|
|||
self._set_scores(beam, scores[i])
|
||||
if self.golds is not None:
|
||||
self._set_costs(beam, self.golds[i], follow_gold=follow_gold)
|
||||
beam.advance(_transition_state, NULL, <void*>self.moves.c)
|
||||
beam.check_done(_check_final_state, NULL)
|
||||
beam.advance(transition_state, NULL, <void*>self.moves.c)
|
||||
beam.check_done(check_final_state, NULL)
|
||||
# This handles the non-monotonic stuff for the parser.
|
||||
if beam.is_done and self.golds is not None:
|
||||
for j in range(beam.size):
|
||||
|
@ -92,8 +106,6 @@ cdef class ParserBeam(object):
|
|||
try:
|
||||
if self.moves.is_gold_parse(state, self.golds[i]):
|
||||
beam._states[j].loss = 0.0
|
||||
elif beam._states[j].loss == 0.0:
|
||||
beam._states[j].loss = 1.0
|
||||
except NotImplementedError:
|
||||
break
|
||||
|
||||
|
@ -119,8 +131,12 @@ cdef class ParserBeam(object):
|
|||
self.moves.set_costs(beam.is_valid[i], beam.costs[i],
|
||||
state, gold)
|
||||
if follow_gold:
|
||||
min_cost = 0
|
||||
for j in range(beam.nr_class):
|
||||
if beam.costs[i][j] >= 1:
|
||||
if beam.is_valid[i][j] and beam.costs[i][j] < min_cost:
|
||||
min_cost = beam.costs[i][j]
|
||||
for j in range(beam.nr_class):
|
||||
if beam.costs[i][j] > min_cost:
|
||||
beam.is_valid[i][j] = 0
|
||||
|
||||
|
||||
|
@ -144,15 +160,13 @@ nr_update = 0
|
|||
def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
||||
states, golds,
|
||||
state2vec, vec2scores,
|
||||
int width, float density, int hist_feats,
|
||||
losses=None, drop=0.):
|
||||
int width, losses=None, drop=0.,
|
||||
early_update=True, beam_density=0.0):
|
||||
global nr_update
|
||||
cdef MaxViolation violn
|
||||
nr_update += 1
|
||||
pbeam = ParserBeam(moves, states, golds,
|
||||
width=width, density=density)
|
||||
gbeam = ParserBeam(moves, states, golds,
|
||||
width=width, density=density)
|
||||
pbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
|
||||
gbeam = ParserBeam(moves, states, golds, width=width, density=beam_density)
|
||||
cdef StateClass state
|
||||
beam_maps = []
|
||||
backprops = []
|
||||
|
@ -177,13 +191,7 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
|||
# Now that we have our flat list of states, feed them through the model
|
||||
token_ids = get_token_ids(states, nr_feature)
|
||||
vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop)
|
||||
if hist_feats:
|
||||
hists = numpy.asarray([st.history[:hist_feats] for st in states],
|
||||
dtype='i')
|
||||
scores, bp_scores = vec2scores.begin_update((vectors, hists),
|
||||
drop=drop)
|
||||
else:
|
||||
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||
scores, bp_scores = vec2scores.begin_update(vectors, drop=drop)
|
||||
|
||||
# Store the callbacks for the backward pass
|
||||
backprops.append((token_ids, bp_vectors, bp_scores))
|
||||
|
@ -194,13 +202,17 @@ def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
|
|||
for indices in p_indices]
|
||||
g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')
|
||||
for indices in g_indices]
|
||||
# Now advance the states in the beams. The gold beam is contrained to
|
||||
# Now advance the states in the beams. The gold beam is constrained to
|
||||
# to follow only gold analyses.
|
||||
pbeam.advance(p_scores)
|
||||
gbeam.advance(g_scores, follow_gold=True)
|
||||
# Track the "maximum violation", to use in the update.
|
||||
for i, violn in enumerate(violns):
|
||||
violn.check_crf(pbeam[i], gbeam[i])
|
||||
# Use 'early update' if best gold is way out of contention.
|
||||
if pbeam[i].loss > 0 and pbeam[i].min_score > (gbeam[i].score * 5.00):
|
||||
pbeam.dones[i] = True
|
||||
gbeam.dones[i] = True
|
||||
histories = []
|
||||
losses = []
|
||||
for violn in violns:
|
||||
|
@ -264,14 +276,15 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
Each batch has multiple beams
|
||||
So history is list of lists of lists of ints
|
||||
"""
|
||||
nr_step = len(beam_maps)
|
||||
grads = []
|
||||
nr_step = 0
|
||||
nr_steps = []
|
||||
for eg_id, hists in enumerate(histories):
|
||||
nr_step = 0
|
||||
for loss, hist in zip(losses[eg_id], hists):
|
||||
if loss != 0.0 and not numpy.isnan(loss):
|
||||
nr_step = max(nr_step, len(hist))
|
||||
for i in range(nr_step):
|
||||
nr_steps.append(nr_step)
|
||||
for i in range(max(nr_steps)):
|
||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
|
||||
dtype='f'))
|
||||
if len(histories) != len(losses):
|
||||
|
@ -282,8 +295,11 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
continue
|
||||
key = tuple([eg_id])
|
||||
# Adjust loss for length
|
||||
# We need to do this because each state in a short path is scored
|
||||
# multiple times, as we add in the average cost when we run out
|
||||
# of actions.
|
||||
avg_loss = loss / len(hist)
|
||||
loss += avg_loss * (nr_step - len(hist))
|
||||
loss += avg_loss * (nr_steps[eg_id] - len(hist))
|
||||
for j, clas in enumerate(hist):
|
||||
i = beam_maps[j][key]
|
||||
# In step j, at state i action clas
|
||||
|
@ -291,3 +307,27 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
grads[j][i, clas] += loss
|
||||
key = key + tuple([clas])
|
||||
return grads
|
||||
|
||||
|
||||
def cleanup_beam(Beam beam):
|
||||
cdef StateC* state
|
||||
# Once parsing has finished, states in beam may not be unique. Is this
|
||||
# correct?
|
||||
seen = set()
|
||||
for i in range(beam.width):
|
||||
addr = <size_t>beam._parents[i].content
|
||||
if addr not in seen:
|
||||
state = <StateC*>addr
|
||||
del state
|
||||
seen.add(addr)
|
||||
else:
|
||||
raise ValueError(Errors.E023.format(addr=addr, i=i))
|
||||
addr = <size_t>beam._states[i].content
|
||||
if addr not in seen:
|
||||
state = <StateC*>addr
|
||||
del state
|
||||
seen.add(addr)
|
||||
else:
|
||||
raise ValueError(Errors.E023.format(addr=addr, i=i))
|
||||
|
||||
|
||||
|
|
49
spacy/syntax/_parser_model.pxd
Normal file
49
spacy/syntax/_parser_model.pxd
Normal file
|
@ -0,0 +1,49 @@
|
|||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||
|
||||
from ._state cimport StateC
|
||||
|
||||
|
||||
cdef struct SizesC:
|
||||
int states
|
||||
int classes
|
||||
int hiddens
|
||||
int pieces
|
||||
int feats
|
||||
int embed_width
|
||||
|
||||
|
||||
cdef struct WeightsC:
|
||||
const float* feat_weights
|
||||
const float* feat_bias
|
||||
const float* hidden_bias
|
||||
const float* hidden_weights
|
||||
const float* vectors
|
||||
|
||||
|
||||
cdef struct ActivationsC:
|
||||
int* token_ids
|
||||
float* vectors
|
||||
float* unmaxed
|
||||
float* scores
|
||||
float* hiddens
|
||||
int* is_valid
|
||||
int _curr_size
|
||||
int _max_size
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *
|
||||
|
||||
cdef SizesC get_c_sizes(model, int batch_size) except *
|
||||
|
||||
cdef void resize_activations(ActivationsC* A, SizesC n) nogil
|
||||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores, int O) nogil
|
||||
|
402
spacy/syntax/_parser_model.pyx
Normal file
402
spacy/syntax/_parser_model.pyx
Normal file
|
@ -0,0 +1,402 @@
|
|||
# cython: infer_types=True
|
||||
# cython: cdivision=True
|
||||
# cython: boundscheck=False
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from collections import OrderedDict
|
||||
import ujson
|
||||
import json
|
||||
import numpy
|
||||
cimport cython.parallel
|
||||
import cytoolz
|
||||
import numpy.random
|
||||
cimport numpy as np
|
||||
from libc.math cimport exp
|
||||
from libcpp.vector cimport vector
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t, class_t, hash_t
|
||||
from thinc.extra.search cimport Beam
|
||||
from thinc.api import chain, clone
|
||||
from thinc.v2v import Model, Maxout, Affine
|
||||
from thinc.misc import LayerNorm
|
||||
from thinc.neural.ops import CupyOps
|
||||
from thinc.neural.util import get_array_module
|
||||
from thinc.linalg cimport Vec, VecVec
|
||||
from thinc cimport openblas
|
||||
|
||||
|
||||
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
|
||||
from .._ml import link_vectors_to_models, create_default_optimizer
|
||||
from ..compat import json_dumps, copy_array
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..gold cimport GoldParse
|
||||
from ..errors import Errors, TempErrors
|
||||
from .. import util
|
||||
from .stateclass cimport StateClass
|
||||
from .transition_system cimport Transition
|
||||
from . import _beam_utils
|
||||
from . import nonproj
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *:
|
||||
cdef WeightsC output
|
||||
cdef precompute_hiddens state2vec = model.state2vec
|
||||
output.feat_weights = state2vec.get_feat_weights()
|
||||
output.feat_bias = <const float*>state2vec.bias.data
|
||||
cdef np.ndarray vec2scores_W = model.vec2scores.W
|
||||
cdef np.ndarray vec2scores_b = model.vec2scores.b
|
||||
output.hidden_weights = <const float*>vec2scores_W.data
|
||||
output.hidden_bias = <const float*>vec2scores_b.data
|
||||
cdef np.ndarray tokvecs = model.tokvecs
|
||||
output.vectors = <float*>tokvecs.data
|
||||
return output
|
||||
|
||||
|
||||
cdef SizesC get_c_sizes(model, int batch_size) except *:
|
||||
cdef SizesC output
|
||||
output.states = batch_size
|
||||
output.classes = model.vec2scores.nO
|
||||
output.hiddens = model.state2vec.nO
|
||||
output.pieces = model.state2vec.nP
|
||||
output.feats = model.state2vec.nF
|
||||
output.embed_width = model.tokvecs.shape[1]
|
||||
return output
|
||||
|
||||
|
||||
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
||||
if n.states <= A._max_size:
|
||||
A._curr_size = n.states
|
||||
return
|
||||
if A._max_size == 0:
|
||||
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
|
||||
A.vectors = <float*>calloc(n.states * n.embed_width, sizeof(A.vectors[0]))
|
||||
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
else:
|
||||
A.token_ids = <int*>realloc(A.token_ids,
|
||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||
A.vectors = <float*>realloc(A.vectors,
|
||||
n.states * n.embed_width * sizeof(A.vectors[0]))
|
||||
A.scores = <float*>realloc(A.scores,
|
||||
n.states * n.classes * sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>realloc(A.hiddens,
|
||||
n.states * n.hiddens * sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>realloc(A.is_valid,
|
||||
n.states * n.classes * sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
resize_activations(A, n)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
sum_state_features(A.unmaxed,
|
||||
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
W.feat_bias, 1., n.hiddens * n.pieces)
|
||||
for j in range(n.hiddens):
|
||||
index = i * n.hiddens * n.pieces + j * n.pieces
|
||||
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
|
||||
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
|
||||
memset(A.scores, 0, n.states * n.classes * sizeof(float))
|
||||
# Compute hidden-to-output
|
||||
openblas.simple_gemm(A.scores, n.states, n.classes,
|
||||
A.hiddens, n.states, n.hiddens,
|
||||
W.hidden_weights, n.classes, n.hiddens, 0, 1)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.scores[i*n.classes],
|
||||
W.hidden_bias, 1., n.classes)
|
||||
|
||||
|
||||
cdef void sum_state_features(float* output,
|
||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef const float* feature
|
||||
padding = cached
|
||||
cached += F * O
|
||||
cdef int id_stride = F*O
|
||||
cdef float one = 1.
|
||||
for b in range(B):
|
||||
for f in range(F):
|
||||
if token_ids[f] < 0:
|
||||
feature = &padding[f*O]
|
||||
else:
|
||||
idx = token_ids[f] * id_stride + f*O
|
||||
feature = &cached[idx]
|
||||
openblas.simple_axpy(&output[b*O], O,
|
||||
feature, one)
|
||||
token_ids += F
|
||||
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores,
|
||||
int O) nogil:
|
||||
"""Do multi-label log loss"""
|
||||
cdef double max_, gmax, Z, gZ
|
||||
best = arg_max_if_gold(scores, costs, is_valid, O)
|
||||
guess = arg_max_if_valid(scores, is_valid, O)
|
||||
Z = 1e-10
|
||||
gZ = 1e-10
|
||||
max_ = scores[guess]
|
||||
gmax = scores[best]
|
||||
for i in range(O):
|
||||
if is_valid[i]:
|
||||
Z += exp(scores[i] - max_)
|
||||
if costs[i] <= costs[best]:
|
||||
gZ += exp(scores[i] - gmax)
|
||||
for i in range(O):
|
||||
if not is_valid[i]:
|
||||
d_scores[i] = 0.
|
||||
elif costs[i] <= costs[best]:
|
||||
d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
|
||||
else:
|
||||
d_scores[i] = exp(scores[i]-max_) / Z
|
||||
|
||||
|
||||
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
||||
const int* is_valid, int n) nogil:
|
||||
# Find minimum cost
|
||||
cdef float cost = 1
|
||||
for i in range(n):
|
||||
if is_valid[i] and costs[i] < cost:
|
||||
cost = costs[i]
|
||||
# Now find best-scoring with that cost
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if costs[i] <= cost and is_valid[i]:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if is_valid[i] >= 1:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
class ParserModel(Model):
|
||||
def __init__(self, tok2vec, lower_model, upper_model):
|
||||
Model.__init__(self)
|
||||
self._layers = [tok2vec, lower_model, upper_model]
|
||||
|
||||
def begin_update(self, docs, drop=0.):
|
||||
step_model = ParserStepModel(docs, self._layers, drop=drop)
|
||||
def finish_parser_update(golds, sgd=None):
|
||||
step_model.make_updates(sgd)
|
||||
return None
|
||||
return step_model, finish_parser_update
|
||||
|
||||
def resize_output(self, new_output):
|
||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
# just adding rows here.
|
||||
smaller = self._layers[-1]._layers[-1]
|
||||
larger = Affine(self.moves.n_moves, smaller.nI)
|
||||
copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
self._layers[-1]._layers[-1] = larger
|
||||
|
||||
@property
|
||||
def tok2vec(self):
|
||||
return self._layers[0]
|
||||
|
||||
@property
|
||||
def lower(self):
|
||||
return self._layers[1]
|
||||
|
||||
@property
|
||||
def upper(self):
|
||||
return self._layers[2]
|
||||
|
||||
|
||||
class ParserStepModel(Model):
|
||||
def __init__(self, docs, layers, drop=0.):
|
||||
self.tokvecs, self.bp_tokvecs = layers[0].begin_update(docs, drop=drop)
|
||||
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
|
||||
drop=drop)
|
||||
self.vec2scores = layers[-1]
|
||||
self.cuda_stream = util.get_cuda_stream()
|
||||
self.backprops = []
|
||||
|
||||
@property
|
||||
def nO(self):
|
||||
return self.state2vec.nO
|
||||
|
||||
def begin_update(self, states, drop=0.):
|
||||
token_ids = self.get_token_ids(states)
|
||||
vector, get_d_tokvecs = self.state2vec.begin_update(token_ids, drop=0.0)
|
||||
mask = self.ops.get_dropout_mask(vector.shape, drop)
|
||||
if mask is not None:
|
||||
vector *= mask
|
||||
scores, get_d_vector = self.vec2scores.begin_update(vector, drop=drop)
|
||||
|
||||
def backprop_parser_step(d_scores, sgd=None):
|
||||
d_vector = get_d_vector(d_scores, sgd=sgd)
|
||||
if mask is not None:
|
||||
d_vector *= mask
|
||||
if isinstance(self.ops, CupyOps) \
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
self.backprops.append((
|
||||
util.get_async(self.cuda_stream, token_ids),
|
||||
util.get_async(self.cuda_stream, d_vector),
|
||||
get_d_tokvecs
|
||||
))
|
||||
else:
|
||||
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
||||
return None
|
||||
return scores, backprop_parser_step
|
||||
|
||||
def get_token_ids(self, batch):
|
||||
states = _beam_utils.collect_states(batch)
|
||||
cdef StateClass state
|
||||
states = [state for state in states if not state.is_final()]
|
||||
cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
|
||||
dtype='i', order='C')
|
||||
ids.fill(-1)
|
||||
c_ids = <int*>ids.data
|
||||
for state in states:
|
||||
state.c.set_context_tokens(c_ids, ids.shape[1])
|
||||
c_ids += ids.shape[1]
|
||||
return ids
|
||||
|
||||
def make_updates(self, sgd):
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if self.cuda_stream is not None:
|
||||
self.cuda_stream.synchronize()
|
||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||
# values don't affect the real gradient.
|
||||
d_tokvecs = self.ops.allocate((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
|
||||
for ids, d_vector, bp_vector in self.backprops:
|
||||
d_state_features = bp_vector((d_vector, ids), sgd=sgd)
|
||||
ids = ids.flatten()
|
||||
d_state_features = d_state_features.reshape(
|
||||
(ids.size, d_state_features.shape[2]))
|
||||
self.ops.scatter_add(d_tokvecs, ids,
|
||||
d_state_features)
|
||||
# Padded -- see update()
|
||||
self.bp_tokvecs(d_tokvecs[:-1], sgd=sgd)
|
||||
return d_tokvecs
|
||||
|
||||
|
||||
cdef class precompute_hiddens:
|
||||
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
||||
|
||||
This is used for the parser, where we want to take a batch of documents,
|
||||
and compute vectors for each (token, position) pair. These vectors can then
|
||||
be reused, especially for beam-search.
|
||||
|
||||
Let's say we're using 12 features for each state, e.g. word at start of
|
||||
buffer, three words on stack, their children, etc. In the normal arc-eager
|
||||
system, a document of length N is processed in 2*N states. This means we'll
|
||||
create 2*N*12 feature vectors --- but if we pre-compute, we only need
|
||||
N*12 vector computations. The saving for beam-search is much better:
|
||||
if we have a beam of k, we'll normally make 2*N*12*K computations --
|
||||
so we can save the factor k. This also gives a nice CPU/GPU division:
|
||||
we can do all our hard maths up front, packed into large multiplications,
|
||||
and do the hard-to-program parsing on the CPU.
|
||||
"""
|
||||
cdef readonly int nF, nO, nP
|
||||
cdef bint _is_synchronized
|
||||
cdef public object ops
|
||||
cdef np.ndarray _features
|
||||
cdef np.ndarray _cached
|
||||
cdef np.ndarray bias
|
||||
cdef object _cuda_stream
|
||||
cdef object _bp_hiddens
|
||||
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||
drop=0.):
|
||||
gpu_cached, bp_features = lower_model.begin_update(tokvecs, drop=drop)
|
||||
cdef np.ndarray cached
|
||||
if not isinstance(gpu_cached, numpy.ndarray):
|
||||
# Note the passing of cuda_stream here: it lets
|
||||
# cupy make the copy asynchronously.
|
||||
# We then have to block before first use.
|
||||
cached = gpu_cached.get(stream=cuda_stream)
|
||||
else:
|
||||
cached = gpu_cached
|
||||
if not isinstance(lower_model.b, numpy.ndarray):
|
||||
self.bias = lower_model.b.get()
|
||||
else:
|
||||
self.bias = lower_model.b
|
||||
self.nF = cached.shape[1]
|
||||
self.nP = getattr(lower_model, 'nP', 1)
|
||||
self.nO = cached.shape[2]
|
||||
self.ops = lower_model.ops
|
||||
self._is_synchronized = False
|
||||
self._cuda_stream = cuda_stream
|
||||
self._cached = cached
|
||||
self._bp_hiddens = bp_features
|
||||
|
||||
cdef const float* get_feat_weights(self) except NULL:
|
||||
if not self._is_synchronized and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
return <float*>self._cached.data
|
||||
|
||||
def __call__(self, X):
|
||||
return self.begin_update(X)[0]
|
||||
|
||||
def begin_update(self, token_ids, drop=0.):
|
||||
cdef np.ndarray state_vector = numpy.zeros(
|
||||
(token_ids.shape[0], self.nO, self.nP), dtype='f')
|
||||
# This is tricky, but (assuming GPU available);
|
||||
# - Input to forward on CPU
|
||||
# - Output from forward on CPU
|
||||
# - Input to backward on GPU!
|
||||
# - Output from backward on GPU
|
||||
bp_hiddens = self._bp_hiddens
|
||||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(<float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
state_vector += self.bias
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
def backward(d_state_vector_ids, sgd=None):
|
||||
d_state_vector, token_ids = d_state_vector_ids
|
||||
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
|
||||
# This will usually be on GPU
|
||||
if not isinstance(d_state_vector, self.ops.xp.ndarray):
|
||||
d_state_vector = self.ops.xp.array(d_state_vector)
|
||||
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
|
||||
return d_tokens
|
||||
return state_vector, backward
|
||||
|
||||
def _nonlinearity(self, state_vector):
|
||||
if self.nP == 1:
|
||||
state_vector = state_vector.reshape(state_vector.shape[:-1])
|
||||
mask = state_vector >= 0.
|
||||
state_vector *= mask
|
||||
else:
|
||||
state_vector, mask = self.ops.maxout(state_vector)
|
||||
|
||||
def backprop_nonlinearity(d_best, sgd=None):
|
||||
if self.nP == 1:
|
||||
d_best *= mask
|
||||
d_best = d_best.reshape((d_best.shape + (1,)))
|
||||
return d_best
|
||||
else:
|
||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
||||
return state_vector, backprop_nonlinearity
|
||||
|
|
@ -6,6 +6,7 @@ from ..vocab cimport Vocab
|
|||
from ..tokens.doc cimport Doc
|
||||
from ..structs cimport TokenC
|
||||
from ._state cimport StateC
|
||||
from ._parser_model cimport WeightsC, ActivationsC, SizesC
|
||||
|
||||
|
||||
cdef class Parser:
|
||||
|
@ -14,8 +15,10 @@ cdef class Parser:
|
|||
cdef readonly TransitionSystem moves
|
||||
cdef readonly object cfg
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, StateC** states, int nr_task,
|
||||
const float* feat_weights, const float* bias,
|
||||
const float* hW, const float* hb,
|
||||
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
|
||||
|
||||
cdef void _parseC(self, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -5,9 +5,12 @@ from __future__ import unicode_literals
|
|||
from cpython.ref cimport Py_INCREF
|
||||
from cymem.cymem cimport Pool
|
||||
from thinc.typedefs cimport weight_t
|
||||
from thinc.extra.search cimport Beam
|
||||
from collections import OrderedDict, Counter
|
||||
import ujson
|
||||
|
||||
from . cimport _beam_utils
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..structs cimport TokenC
|
||||
from .stateclass cimport StateClass
|
||||
from ..typedefs cimport attr_t
|
||||
|
@ -57,6 +60,21 @@ cdef class TransitionSystem:
|
|||
offset += len(doc)
|
||||
return states
|
||||
|
||||
def init_beams(self, docs, beam_width, beam_density=0.):
|
||||
cdef Doc doc
|
||||
beams = []
|
||||
cdef int offset = 0
|
||||
for doc in docs:
|
||||
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
|
||||
beam.initialize(self.init_beam_state, doc.length, doc.c)
|
||||
for i in range(beam.width):
|
||||
state = <StateC*>beam.at(i)
|
||||
state.offset = offset
|
||||
offset += len(doc)
|
||||
beam.check_done(_beam_utils.check_final_state, NULL)
|
||||
beams.append(beam)
|
||||
return beams
|
||||
|
||||
def get_oracle_sequence(self, doc, GoldParse gold):
|
||||
cdef Pool mem = Pool()
|
||||
costs = <float*>mem.alloc(self.n_moves, sizeof(float))
|
||||
|
|
|
@ -35,8 +35,7 @@ def parser(vocab, arc_eager):
|
|||
|
||||
@pytest.fixture
|
||||
def model(arc_eager, tok2vec):
|
||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO,
|
||||
hist_size=0)[0]
|
||||
return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0]
|
||||
|
||||
@pytest.fixture
|
||||
def doc(vocab):
|
||||
|
@ -69,11 +68,13 @@ def test_update_doc(parser, model, doc, gold):
|
|||
parser.update([doc], [gold], sgd=optimize)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_predict_doc_beam(parser, model, doc):
|
||||
parser.model = model
|
||||
parser(doc, beam_width=32, beam_density=0.001)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_update_doc_beam(parser, model, doc, gold):
|
||||
parser.model = model
|
||||
def optimize(weights, gradient, key=None):
|
||||
|
|
|
@ -34,6 +34,7 @@ def test_util_get_package_path(package):
|
|||
assert isinstance(path, Path)
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_displacy_parse_ents(en_vocab):
|
||||
"""Test that named entities on a Doc are converted into displaCy's format."""
|
||||
doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
|
||||
|
@ -44,6 +45,7 @@ def test_displacy_parse_ents(en_vocab):
|
|||
assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}]
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_displacy_parse_deps(en_vocab):
|
||||
"""Test that deps and tags on a Doc are converted into displaCy's format."""
|
||||
words = ["This", "is", "a", "sentence"]
|
||||
|
@ -64,6 +66,7 @@ def test_displacy_parse_deps(en_vocab):
|
|||
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
|
||||
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
|
||||
assert model.W.shape == (nF, nO, nP, nI)
|
||||
|
|
Loading…
Reference in New Issue
Block a user