Merge branch 'develop' into feature/phrasematcher

Matthew Honnibal 2017-09-26 09:59:17 -05:00
commit 41cc5c4c17
21 changed files with 287 additions and 196 deletions

View File

@@ -24,7 +24,6 @@ install:
   - "%PYTHON%\\python.exe -m pip install wheel"
   - "%PYTHON%\\python.exe -m pip install cython"
   - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
-  - "%PYTHON%\\python.exe setup.py build_ext --inplace"
   - "%PYTHON%\\python.exe -m pip install -e ."
 build: off

View File

@@ -16,3 +16,4 @@ pytest>=3.0.6,<4.0.0
 mock>=2.0.0,<3.0.0
 msgpack-python
 msgpack-numpy
+html5lib==1.0b8

View File

@@ -240,7 +240,6 @@ def link_vectors_to_models(vocab):
     # (unideal, I know)
     thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data

 def Tok2Vec(width, embed_size, **kwargs):
     pretrained_dims = kwargs.get('pretrained_dims', 0)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 3)
@@ -271,7 +270,7 @@ def Tok2Vec(width, embed_size, **kwargs):
         tok2vec = (
             FeatureExtracter(cols)
             >> with_flatten(
-                embed >> (convolution * 4), pad=4)
+                embed >> (convolution ** 4), pad=4)
         )
         # Work around thinc API limitations :(. TODO: Revise in Thinc 7
@@ -513,17 +512,17 @@ def build_tagger_model(nr_class, **cfg):
     token_vector_width = util.env_opt('token_vector_width', 128)
     pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add}):
-        # Input: (doc, tensor) tuples
-        private_tok2vec = Tok2Vec(token_vector_width, embed_size,
+        if 'tok2vec' in cfg:
+            tok2vec = cfg['tok2vec']
+        else:
+            tok2vec = Tok2Vec(token_vector_width, embed_size,
                               pretrained_dims=pretrained_dims)
         model = (
-            fine_tune(private_tok2vec)
-            >> with_flatten(
-                Maxout(token_vector_width, token_vector_width)
-                >> Softmax(nr_class, token_vector_width)
-            )
+            tok2vec
+            >> with_flatten(Softmax(nr_class, token_vector_width))
         )
         model.nI = None
+        model.tok2vec = tok2vec
     return model
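
A minimal, self-contained sketch of the sharing pattern build_tagger_model now follows: the caller may inject a shared 'tok2vec' layer through cfg, and the builder keeps a handle to it on the finished model so other components can reuse it. The names below are illustrative stand-ins, not spaCy's API.

    def make_default_tok2vec():
        # Toy embedder: one "vector" of word lengths per doc.
        return lambda docs: [[len(word) for word in doc] for doc in docs]

    def build_model(nr_class, **cfg):
        # Reuse an injected layer if the caller provides one, else build our own.
        tok2vec = cfg.get('tok2vec') or make_default_tok2vec()
        def model(docs):
            return [vec[:nr_class] for vec in tok2vec(docs)]  # toy output head
        model.tok2vec = tok2vec  # keep a handle for later sharing
        return model

    shared = make_default_tok2vec()
    tagger = build_model(10, tok2vec=shared)
    labeller = build_model(5, tok2vec=shared)  # both heads share one embedder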

View File

@@ -3,12 +3,13 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a14'
+__version__ = '2.0.0a15'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
 __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
+__release__ = False
 __docs_models__ = 'https://spacy.io/docs/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'

View File

@@ -11,6 +11,8 @@ import tqdm
 from thinc.neural._classes.model import Model
 from thinc.neural.optimizers import linear_decay
 from timeit import default_timer as timer
+import random
+import numpy.random

 from ..tokens.doc import Doc
 from ..scorer import Scorer
@@ -22,6 +24,9 @@ from .. import about
 from .. import displacy
 from ..compat import json_dumps

+random.seed(0)
+numpy.random.seed(0)

 @plac.annotations(
     lang=("model language", "positional", None, str),
@@ -63,7 +68,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
         prints("Expected dict but got: {}".format(type(meta)),
                title="Not a valid meta.json format", exits=1)
-    pipeline = ['token_vectors', 'tags', 'dependencies', 'entities']
+    pipeline = ['tags', 'dependencies', 'entities']
     if no_tagger and 'tags' in pipeline: pipeline.remove('tags')
     if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies')
     if no_entities and 'entities' in pipeline: pipeline.remove('entities')
@@ -99,8 +104,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             for batch in minibatch(train_docs, size=batch_sizes):
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer,
-                           drop=next(dropout_rates), losses=losses,
-                           update_shared=True)
+                           drop=next(dropout_rates), losses=losses)
                 pbar.update(sum(len(doc) for doc in docs))
         with nlp.use_params(optimizer.averages):
@@ -109,10 +113,13 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
             nlp.to_disk(epoch_model_path)
             nlp_loaded = lang_class(pipeline=pipeline)
             nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
-            scorer = nlp.evaluate(
-                corpus.dev_docs(
-                    nlp,
-                    gold_preproc=gold_preproc))
+            scorer = nlp_loaded.evaluate(
+                list(corpus.dev_docs(
+                    nlp_loaded,
+                    gold_preproc=gold_preproc)))
+            acc_loc =(output_path / ('model%d' % i) / 'accuracy.json')
+            with acc_loc.open('w') as file_:
+                file_.write(json_dumps(scorer.scores))
             meta_loc = output_path / ('model%d' % i) / 'meta.json'
             meta['accuracy'] = scorer.scores
             meta['lang'] = nlp.lang
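
A hedged sketch of the training loop these changes settle on: batches go straight into nlp.update() (no update_shared flag and no 'token_vectors' pipe), and per-epoch scores are written next to the saved model. It assumes nlp, optimizer, corpus, train_docs, batch_sizes, dropout_rates, output_path and json_dumps are already set up as in spacy/cli/train.py; this is an outline, not a drop-in script.

    losses = {}
    for batch in minibatch(train_docs, size=batch_sizes):
        docs, golds = zip(*batch)
        nlp.update(docs, golds, sgd=optimizer,
                   drop=next(dropout_rates), losses=losses)

    with nlp.use_params(optimizer.averages):
        scorer = nlp.evaluate(list(corpus.dev_docs(nlp, gold_preproc=False)))
    acc_loc = output_path / 'model0' / 'accuracy.json'
    with acc_loc.open('w') as file_:
        file_.write(json_dumps(scorer.scores))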

View File

@@ -29,9 +29,9 @@ _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
           'TB T G M K %')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$'
-_punct = r', : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &'
+_punct = r'…… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* & ·'
 _quotes = r'\' \'\' " ” “ `` ` ´ , „ » «'
-_hyphens = '- — -- ---'
+_hyphens = '- — -- --- —— ~'
 _other_symbols = r'[\p{So}]'

 UNITS = merge_chars(_units)
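
The space-separated class strings above get merged into regex alternations further down in this module (as in UNITS = merge_chars(_units)); a standalone illustration of what the widened _hyphens class now covers, using a plain join rather than spaCy's own helper:

    import re

    _hyphens = '- — -- --- —— ~'
    HYPHENS = '|'.join(re.escape(piece) for piece in _hyphens.split())

    assert re.fullmatch(HYPHENS, '---')   # already covered before
    assert re.fullmatch(HYPHENS, '——')    # newly added double em-dash
    assert re.fullmatch(HYPHENS, '~')     # newly added tilde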

View File

@@ -4,6 +4,7 @@
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from .lemmatizer import LOOKUP
 from .syntax_iterators import SYNTAX_ITERATORS
@@ -17,6 +18,7 @@ from ...util import update_exc, add_lookups
 class FrenchDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'fr'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

View File

@@ -0,0 +1,41 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = set("""
+zero un deux trois quatre cinq six sept huit neuf dix
+onze douze treize quatorze quinze seize dix-sept dix-huit dix-neuf
+vingt trente quanrante cinquante soixante septante quatre-vingt huitante nonante
+cent mille mil million milliard billion quadrillion quintillion
+sextillion septillion octillion nonillion decillion
+""".split())
+
+_ordinal_words = set("""
+premier deuxième second troisième quatrième cinquième sixième septième huitième neuvième dixième
+onzième douzième treizième quatorzième quinzième seizième dix-septième dix-huitième dix-neufième
+vingtième trentième quanrantième cinquantième soixantième septantième quatre-vingtième huitantième nonantième
+centième millième millionnième milliardième billionnième quadrillionnième quintillionnième
+sextillionnième septillionnième octillionnième nonillionnième decillionnième
+""".split())
+
+
+def like_num(text):
+    # Might require more work?
+    # See this discussion: https://github.com/explosion/spaCy/pull/1161
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
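
A quick standalone check of the heuristic above, with the body of like_num copied verbatim and only the word set abbreviated so it runs without spaCy: separators are stripped before the digit test, simple fractions pass, number words pass, and ordinals are collected in _ordinal_words but not yet consulted.

    _num_words = set("zero un deux trois vingt cent mille".split())  # abbreviated

    def like_num(text):
        text = text.replace(',', '').replace('.', '')
        if text.isdigit():
            return True
        if text.count('/') == 1:
            num, denom = text.split('/')
            if num.isdigit() and denom.isdigit():
                return True
        if text in _num_words:
            return True
        return False

    assert like_num('10,5') and like_num('3.000') and like_num('3/4')
    assert like_num('vingt')
    assert not like_num('vingtième')  # ordinals are not handled by like_num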

View File

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -12,6 +13,7 @@ from ...util import update_exc, add_lookups
 class DutchDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'nl'
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)

View File

@@ -0,0 +1,40 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = set("""
+nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
+veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd
+duizend miljoen miljard biljoen biljard triljoen triljard
+""".split())
+
+_ordinal_words = set("""
+eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
+twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste
+zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste
+miljardste biljoenste biljardste triljoenste triljardste
+""".split())
+
+
+def like_num(text):
+    # This only does the most basic check for whether a token is a digit
+    # or matches one of the number words. In order to handle numbers like
+    # "drieëntwintig", more work is required.
+    # See this discussion: https://github.com/explosion/spaCy/pull/1177
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}

View File

@@ -34,6 +34,7 @@ from .lang.tag_map import TAG_MAP
 from .lang.lex_attrs import LEX_ATTRS
 from . import util
 from .scorer import Scorer
+from ._ml import link_vectors_to_models

 class BaseDefaults(object):
@@ -278,8 +279,7 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def update(self, docs, golds, drop=0., sgd=None, losses=None,
-               update_shared=False):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
         """Update the models in the pipeline.

         docs (iterable): A batch of `Doc` objects.
@@ -303,31 +303,17 @@ class Language(object):
             if self._optimizer is None:
                 self._optimizer = Adam(Model.ops, 0.001)
             sgd = self._optimizer
-        tok2vec = self.pipeline[0]
         grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
-        pipes = list(self.pipeline[1:])
+        pipes = list(self.pipeline)
         random.shuffle(pipes)
-        tokvecses, bp_tokvecses = tok2vec.model.begin_update(docs, drop=drop)
-        all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses]
         for proc in pipes:
             if not hasattr(proc, 'update'):
                 continue
-            d_tokvecses = proc.update((docs, tokvecses), golds,
-                                      drop=drop, sgd=get_grads, losses=losses)
-            if update_shared and d_tokvecses is not None:
-                for i, d_tv in enumerate(d_tokvecses):
-                    all_d_tokvecses[i] += d_tv
-        if update_shared and bp_tokvecses is not None:
-            bp_tokvecses(all_d_tokvecses, sgd=sgd)
+            proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
         for key, (W, dW) in grads.items():
             sgd(W, dW, key=key)
-        # Clear the tensor variable, to free GPU memory.
-        # If we don't do this, the memory leak gets pretty
-        # bad, because we may be holding part of a batch.
-        for doc in docs:
-            doc.tensor = None

     def preprocess_gold(self, docs_golds):
         """Can be called before training to pre-process gold data. By default,
@@ -370,8 +356,6 @@ class Language(object):
         **cfg: Config parameters.
         returns: An optimizer
         """
-        if self.parser:
-            self.pipeline.append(NeuralLabeller(self.vocab))
         # Populate vocab
         if get_gold_tuples is not None:
             for _, annots_brackets in get_gold_tuples():
@@ -386,6 +370,7 @@ class Language(object):
                 self.vocab.vectors.data)
         else:
             device = None
+        link_vectors_to_models(self.vocab)
         for proc in self.pipeline:
             if hasattr(proc, 'begin_training'):
                 context = proc.begin_training(get_gold_tuples(),
@@ -417,7 +402,6 @@ class Language(object):
             assert len(docs) == len(golds)
             for doc, gold in zip(docs, golds):
                 scorer.score(doc, gold)
-                doc.tensor = None
         return scorer

     @contextmanager
@@ -506,7 +490,6 @@ class Language(object):
         """
         path = util.ensure_path(path)
         serializers = OrderedDict((
-            ('vocab', lambda p: self.vocab.to_disk(p)),
             ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
             ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
         ))
@@ -518,6 +501,7 @@ class Language(object):
             if not hasattr(proc, 'to_disk'):
                 continue
             serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
+        serializers['vocab'] = lambda p: self.vocab.to_disk(p)
         util.to_disk(path, serializers, {p: False for p in disable})

     def from_disk(self, path, disable=tuple()):
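
A toy illustration of the flow Language.update now follows: every component receives plain docs, computes its own gradients, and reports them to a shared callback that is applied once per parameter key. The component and weights here are stand-ins, not spaCy classes.

    import random

    class ToyPipe(object):
        name = 'toy'

        def update(self, docs, golds, drop=0., sgd=None, losses=None):
            grad = float(len(docs))            # stand-in for a real gradient
            if sgd is not None:
                sgd(0.0, grad, key=id(self))   # report, don't apply yet

    def update(pipeline, docs, golds, sgd):
        grads = {}
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
        pipes = list(pipeline)
        random.shuffle(pipes)                  # update order shouldn't matter
        for proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            proc.update(docs, golds, drop=0., sgd=get_grads, losses=None)
        for key, (W, dW) in grads.items():     # apply each gradient once
            sgd(W, dW, key=key)

    update([ToyPipe(), ToyPipe()], docs=['a doc'], golds=[None],
           sgd=lambda W, dW, key=None: None)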

View File

@@ -146,6 +146,8 @@ cdef class Morphology:
                 self.add_special_case(tag_str, form_str, attrs)

     def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
+        if orth not in self.strings:
+            return orth
         cdef unicode py_string = self.strings[orth]
         if self.lemmatizer is None:
             return self.strings.add(py_string.lower())

View File

@@ -174,7 +174,7 @@ class BaseThincComponent(object):
         deserialize = OrderedDict((
             ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
-            ('vocab', lambda b: self.vocab.from_bytes(b))
+            ('vocab', lambda b: self.vocab.from_bytes(b)),
             ('model', load_model),
         ))
         util.from_bytes(bytes_data, deserialize, exclude)
@@ -335,27 +335,25 @@ class NeuralTagger(BaseThincComponent):
         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])

     def __call__(self, doc):
-        tags = self.predict(([doc], [doc.tensor]))
+        tags = self.predict([doc])
         self.set_annotations([doc], tags)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
         for docs in cytoolz.partition_all(batch_size, stream):
             docs = list(docs)
-            tokvecs = [d.tensor for d in docs]
-            tag_ids = self.predict((docs, tokvecs))
+            tag_ids = self.predict(docs)
             self.set_annotations(docs, tag_ids)
             yield from docs

-    def predict(self, docs_tokvecs):
-        scores = self.model(docs_tokvecs)
+    def predict(self, docs):
+        scores = self.model(docs)
         scores = self.model.ops.flatten(scores)
         guesses = scores.argmax(axis=1)
         if not isinstance(guesses, numpy.ndarray):
             guesses = guesses.get()
-        tokvecs = docs_tokvecs[1]
         guesses = self.model.ops.unflatten(guesses,
-                                           [tv.shape[0] for tv in tokvecs])
+                                           [len(d) for d in docs])
         return guesses

     def set_annotations(self, docs, batch_tag_ids):
@@ -375,20 +373,16 @@ class NeuralTagger(BaseThincComponent):
                 idx += 1
         doc.is_tagged = True

-    def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
-        docs, tokvecs = docs_tokvecs
-        if self.model.nI is None:
-            self.model.nI = tokvecs[0].shape[1]
-        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
-        d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
+        bp_tag_scores(d_tag_scores, sgd=sgd)
         if losses is not None:
             losses[self.name] += loss
-        return d_tokvecs

     def get_loss(self, docs, golds, scores):
         scores = self.model.ops.flatten(scores)
@@ -514,9 +508,25 @@ class NeuralLabeller(NeuralTagger):
 class NeuralLabeller(NeuralTagger):
     name = 'nn_labeller'

-    def __init__(self, vocab, model=True, **cfg):
+    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
         self.vocab = vocab
         self.model = model
+        if target == 'dep':
+            self.make_label = self.make_dep
+        elif target == 'tag':
+            self.make_label = self.make_tag
+        elif target == 'ent':
+            self.make_label = self.make_ent
+        elif target == 'dep_tag_offset':
+            self.make_label = self.make_dep_tag_offset
+        elif target == 'ent_tag':
+            self.make_label = self.make_ent_tag
+        elif hasattr(target, '__call__'):
+            self.make_label = target
+        else:
+            raise ValueError(
+                "NeuralLabeller target should be function or one of "
+                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
         self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
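
The target argument also accepts any callable with the same (i, words, tags, heads, deps, ents) signature as the make_* helpers added further down; a hedged sketch of a custom target (the NeuralLabeller call is left as a comment because it needs a Vocab):

    def make_coarse_dep(i, words, tags, heads, deps, ents):
        # Collapse dependency subtypes, e.g. 'nmod:poss' -> 'nmod'.
        if deps[i] is None or heads[i] is None:
            return None
        return deps[i].split(':')[0]

    # labeller = NeuralLabeller(vocab, target=make_coarse_dep)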
@@ -532,43 +542,78 @@ class NeuralLabeller(NeuralTagger):
     def set_annotations(self, docs, dep_ids):
         pass

-    def begin_training(self, gold_tuples=tuple(), pipeline=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
-                for dep in deps:
-                    if dep not in self.labels:
-                        self.labels[dep] = len(self.labels)
-        token_vector_width = pipeline[0].model.nO
+                for i in range(len(ids)):
+                    label = self.make_label(i, words, tags, heads, deps, ents)
+                    if label is not None and label not in self.labels:
+                        self.labels[label] = len(self.labels)
+        print(len(self.labels))
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
-            self.model = self.Model(len(self.labels), **self.cfg)
+            self.model = chain(
+                tok2vec,
+                Softmax(len(self.labels), 128)
+            )
         link_vectors_to_models(self.vocab)

     @classmethod
-    def Model(cls, n_tags, **cfg):
-        return build_tagger_model(n_tags, **cfg)
+    def Model(cls, n_tags, tok2vec=None, **cfg):
+        return build_tagger_model(n_tags, tok2vec=tok2vec, **cfg)

     def get_loss(self, docs, golds, scores):
+        scores = self.model.ops.flatten(scores)
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
         for gold in golds:
-            for tag in gold.labels:
-                if tag is None or tag not in self.labels:
+            for i in range(len(gold.labels)):
+                label = self.make_label(i, gold.words, gold.tags, gold.heads,
+                                        gold.labels, gold.ents)
+                if label is None or label not in self.labels:
                     correct[idx] = guesses[idx]
                 else:
-                    correct[idx] = self.labels[tag]
+                    correct[idx] = self.labels[label]
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         d_scores /= d_scores.shape[0]
         loss = (d_scores**2).sum()
+        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

+    @staticmethod
+    def make_dep(i, words, tags, heads, deps, ents):
+        if deps[i] is None or heads[i] is None:
+            return None
+        return deps[i]
+
+    @staticmethod
+    def make_tag(i, words, tags, heads, deps, ents):
+        return tags[i]
+
+    @staticmethod
+    def make_ent(i, words, tags, heads, deps, ents):
+        if ents is None:
+            return None
+        return ents[i]
+
+    @staticmethod
+    def make_dep_tag_offset(i, words, tags, heads, deps, ents):
+        if deps[i] is None or heads[i] is None:
+            return None
+        offset = heads[i] - i
+        offset = min(offset, 2)
+        offset = max(offset, -2)
+        return '%s-%s:%d' % (deps[i], tags[i], offset)
+
+    @staticmethod
+    def make_ent_tag(i, words, tags, heads, deps, ents):
+        if ents is None or ents[i] is None:
+            return None
+        else:
+            return '%s-%s' % (tags[i], ents[i])
+

 class SimilarityHook(BaseThincComponent):
     """
@@ -605,15 +650,10 @@ class SimilarityHook(BaseThincComponent):
             yield self(doc)

     def predict(self, doc1, doc2):
-        return self.model.predict([(doc1.tensor, doc2.tensor)])
+        return self.model.predict([(doc1, doc2)])

-    def update(self, doc1_tensor1_doc2_tensor2, golds, sgd=None, drop=0.):
-        doc1s, tensor1s, doc2s, tensor2s = doc1_tensor1_doc2_tensor2
-        sims, bp_sims = self.model.begin_update(zip(tensor1s, tensor2s),
-                                                drop=drop)
-        d_tensor1s, d_tensor2s = bp_sims(golds, sgd=sgd)
-        return d_tensor1s, d_tensor2s
+    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
+        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)

     def begin_training(self, _=tuple(), pipeline=None):
         """
@@ -669,15 +709,13 @@ class TextCategorizer(BaseThincComponent):
             for j, label in enumerate(self.labels):
                 doc.cats[label] = float(scores[i, j])

-    def update(self, docs_tensors, golds, state=None, drop=0., sgd=None, losses=None):
-        docs, tensors = docs_tensors
+    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
         loss, d_scores = self.get_loss(docs, golds, scores)
-        d_tensors = bp_scores(d_scores, sgd=sgd)
+        bp_scores(d_scores, sgd=sgd)
         if losses is not None:
             losses.setdefault(self.name, 0.0)
             losses[self.name] += loss
-        return d_tensors

     def get_loss(self, docs, golds, scores):
         truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
@@ -739,6 +777,14 @@ cdef class NeuralDependencyParser(NeuralParser):
     name = 'parser'
     TransitionSystem = ArcEager

+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        for target in ['dep', 'ent']:
+            labeller = NeuralLabeller(self.vocab, target=target)
+            tok2vec = self.model[0]
+            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            pipeline.append(labeller)
+            self._multitasks.append(labeller)
+
     def __reduce__(self):
         return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
@@ -749,13 +795,13 @@ cdef class NeuralEntityRecognizer(NeuralParser):
     nr_feature = 6

-    def predict_confidences(self, docs):
-        tensors = [d.tensor for d in docs]
-        samples = []
-        for i in range(10):
-            states = self.parse_batch(docs, tensors, drop=0.3)
-            for state in states:
-                samples.append(self._get_entities(state))
+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        for target in []:
+            labeller = NeuralLabeller(self.vocab, target=target)
+            tok2vec = self.model[0]
+            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
+            pipeline.append(labeller)
+            self._multitasks.append(labeller)

     def __reduce__(self):
         return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)

View File

@@ -147,10 +147,10 @@ def get_token_ids(states, int n_tokens):
 nr_update = 0

 def update_beam(TransitionSystem moves, int nr_feature, int max_steps,
-                states, tokvecs, golds,
+                states, golds,
                 state2vec, vec2scores,
                 int width, float density,
-                sgd=None, losses=None, drop=0.):
+                losses=None, drop=0.):
     global nr_update
     cdef MaxViolation violn
     nr_update += 1

View File

@@ -13,6 +13,7 @@ cdef class Parser:
     cdef public object model
     cdef readonly TransitionSystem moves
     cdef readonly object cfg
+    cdef public object _multitasks

     cdef void _parse_step(self, StateC* state,
         const float* feat_weights,

View File

@@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function
 from collections import Counter, OrderedDict
 import ujson
+import json
 import contextlib

 from libc.math cimport exp
@@ -48,7 +49,7 @@ from .. import util
 from ..util import get_async, get_cuda_stream
 from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune
-from .._ml import Residual, drop_layer
+from .._ml import Residual, drop_layer, flatten
 from .._ml import link_vectors_to_models
 from ..compat import json_dumps
@@ -245,8 +246,9 @@ cdef class Parser:
         hidden_width = util.env_opt('hidden_width', hidden_width)
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2)
         embed_size = util.env_opt('embed_size', 4000)
-        tensors = fine_tune(Tok2Vec(token_vector_width, embed_size,
-                                    pretrained_dims=cfg.get('pretrained_dims')))
+        tok2vec = Tok2Vec(token_vector_width, embed_size,
+                          pretrained_dims=cfg.get('pretrained_dims', 0))
+        tok2vec = chain(tok2vec, flatten)
         if parser_maxout_pieces == 1:
             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class,
                                         nF=cls.nr_feature,
@@ -278,7 +280,7 @@ cdef class Parser:
             'hidden_width': hidden_width,
             'maxout_pieces': parser_maxout_pieces
         }
-        return (tensors, lower, upper), cfg
+        return (tok2vec, lower, upper), cfg

     def __init__(self, Vocab vocab, moves=True, model=True, **cfg):
         """
@@ -317,6 +319,7 @@ cdef class Parser:
             for label in labels:
                 self.moves.add_action(action, label)
         self.model = model
+        self._multitasks = []

     def __reduce__(self):
         return (Parser, (self.vocab, self.moves, self.model), None, None)
@@ -336,11 +339,11 @@ cdef class Parser:
         beam_density = self.cfg.get('beam_density', 0.0)
         cdef Beam beam
         if beam_width == 1:
-            states = self.parse_batch([doc], [doc.tensor])
+            states = self.parse_batch([doc])
             self.set_annotations([doc], states)
             return doc
         else:
-            beam = self.beam_parse([doc], [doc.tensor],
+            beam = self.beam_parse([doc],
                 beam_width=beam_width, beam_density=beam_density)[0]
             output = self.moves.get_beam_annot(beam)
             state = <StateClass>beam.at(0)
@@ -369,11 +372,11 @@ cdef class Parser:
         cdef Beam beam
         for docs in cytoolz.partition_all(batch_size, docs):
             docs = list(docs)
-            tokvecs = [doc.tensor for doc in docs]
             if beam_width == 1:
-                parse_states = self.parse_batch(docs, tokvecs)
+                parse_states = self.parse_batch(docs)
+                beams = []
             else:
-                beams = self.beam_parse(docs, tokvecs,
+                beams = self.beam_parse(docs,
                     beam_width=beam_width, beam_density=beam_density)
                 parse_states = []
                 for beam in beams:
@@ -381,7 +384,7 @@ cdef class Parser:
             self.set_annotations(docs, parse_states)
             yield from docs

-    def parse_batch(self, docs, tokvecses):
+    def parse_batch(self, docs):
         cdef:
             precompute_hiddens state2vec
             StateClass state
@@ -392,21 +395,15 @@ cdef class Parser:
             int nr_class, nr_feat, nr_piece, nr_dim, nr_state
         if isinstance(docs, Doc):
             docs = [docs]
-        if isinstance(tokvecses, np.ndarray):
-            tokvecses = [tokvecses]
-        if USE_FINE_TUNE:
-            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
-        else:
-            tokvecs = self.model[0].ops.flatten(tokvecses)
+        cuda_stream = get_cuda_stream()
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
+                                                                            0.0)
         nr_state = len(docs)
         nr_class = self.moves.n_moves
         nr_dim = tokvecs.shape[1]
         nr_feat = self.nr_feature
-        cuda_stream = get_cuda_stream()
-        state2vec, vec2scores = self.get_batch_model(nr_state, tokvecs,
-                                                     cuda_stream, 0.0)
         nr_piece = state2vec.nP

         states = self.moves.init_batch(docs)
@@ -422,21 +419,23 @@ cdef class Parser:
         c_token_ids = <int*>token_ids.data
         c_is_valid = <int*>is_valid.data
         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
+        cdef int nr_step
         while not next_step.empty():
+            nr_step = next_step.size()
             if not has_hidden:
-                for i in range(
-                        next_step.size(), num_threads=6, nogil=True):
+                for i in cython.parallel.prange(nr_step, num_threads=6,
+                                                nogil=True):
                     self._parse_step(next_step[i],
                         feat_weights, nr_class, nr_feat, nr_piece)
             else:
-                for i in range(next_step.size()):
+                for i in range(nr_step):
                     st = next_step[i]
                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
                     self.moves.set_valid(&c_is_valid[i*nr_class], st)
                 vectors = state2vec(token_ids[:next_step.size()])
                 scores = vec2scores(vectors)
                 c_scores = <float*>scores.data
-                for i in range(next_step.size()):
+                for i in range(nr_step):
                     st = next_step[i]
                     guess = arg_max_if_valid(
                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
@@ -449,19 +448,15 @@ cdef class Parser:
                         next_step.push_back(st)
         return states

-    def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001):
+    def beam_parse(self, docs, int beam_width=3, float beam_density=0.001):
         cdef Beam beam
         cdef np.ndarray scores
         cdef Doc doc
         cdef int nr_class = self.moves.n_moves
         cdef StateClass stcls, output
-        if USE_FINE_TUNE:
-            tokvecs = self.model[0].ops.flatten(self.model[0]((docs, tokvecses)))
-        else:
-            tokvecs = self.model[0].ops.flatten(tokvecses)
         cuda_stream = get_cuda_stream()
-        state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs,
-                                                     cuda_stream, 0.0)
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
+                                                                            0.0)
         beams = []
         cdef int offset = 0
         cdef int j = 0
@@ -521,29 +516,23 @@ cdef class Parser:
             free(scores)
             free(token_ids)

-    def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
             return None
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
-            return self.update_beam(docs_tokvecs, golds,
+            return self.update_beam(docs, golds,
                     self.cfg['beam_width'], self.cfg['beam_density'],
                     drop=drop, sgd=sgd, losses=losses)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
-        docs, tokvec_lists = docs_tokvecs
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
-        if USE_FINE_TUNE:
-            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(my_tokvecs)
-        else:
-            tokvecs = self.model[0].ops.flatten(docs_tokvecs[1])
         cuda_stream = get_cuda_stream()
         states, golds, max_steps = self._init_gold_batch(docs, golds)
-        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
-                                                     0.0)
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream,
+                                                                            0.0)
         todo = [(s, g) for (s, g) in zip(states, golds)
                 if not s.is_final() and g is not None]
@@ -588,13 +577,9 @@ cdef class Parser:
             if n_steps >= max_steps:
                 break
         self._make_updates(d_tokvecs,
-                           backprops, sgd, cuda_stream)
-        d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
-        if USE_FINE_TUNE:
-            d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
-        return d_tokvecs
+                           bp_tokvecs, backprops, sgd, cuda_stream)

-    def update_beam(self, docs_tokvecs, golds, width=None, density=None,
+    def update_beam(self, docs, golds, width=None, density=None,
                     drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
             return None
@@ -606,26 +591,20 @@ cdef class Parser:
             density = self.cfg.get('beam_density', 0.0)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
-        docs, tokvecs = docs_tokvecs
         lengths = [len(d) for d in docs]
         assert min(lengths) >= 1
-        if USE_FINE_TUNE:
-            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            tokvecs = self.model[0].ops.flatten(my_tokvecs)
-        else:
-            tokvecs = self.model[0].ops.flatten(tokvecs)
         states = self.moves.init_batch(docs)
         for gold in golds:
             self.moves.preprocess_gold(gold)
         cuda_stream = get_cuda_stream()
-        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0)
+        (tokvecs, bp_tokvecs), state2vec, vec2scores = self.get_batch_model(docs, cuda_stream, 0.0)
         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500,
-                                                             states, tokvecs, golds,
+                                                             states, golds,
                                                              state2vec, vec2scores,
                                                              width, density,
-                                                             sgd=sgd, drop=drop, losses=losses)
+                                                             drop=drop, losses=losses)
         backprop_lower = []
         cdef float batch_size = len(docs)
         for i, d_scores in enumerate(states_d_scores):
@@ -643,20 +622,7 @@ cdef class Parser:
             else:
                 backprop_lower.append((ids, d_vector, bp_vectors))
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
-        self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
-        d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
-        if USE_FINE_TUNE:
-            d_tokvecs = bp_my_tokvecs(d_tokvecs, sgd=sgd)
-        return d_tokvecs
-
-    def _pad_tokvecs(self, tokvecs):
-        # Add a vector for missing values at the start of tokvecs
-        xp = get_array_module(tokvecs)
-        pad = xp.zeros((1, tokvecs.shape[1]), dtype=tokvecs.dtype)
-        return xp.vstack((pad, tokvecs))
-
-    def _unpad_tokvecs(self, d_tokvecs):
-        return d_tokvecs[1:]
+        self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream)

     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
@@ -694,7 +660,7 @@ cdef class Parser:
             max_moves = max(max_moves, len(oracle_actions))
         return states, golds, max_moves

-    def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
+    def _make_updates(self, d_tokvecs, bp_tokvecs, backprops, sgd, cuda_stream=None):
         # Tells CUDA to block, so our async copies complete.
         if cuda_stream is not None:
             cuda_stream.synchronize()
@@ -705,6 +671,7 @@ cdef class Parser:
             d_state_features *= mask.reshape(ids.shape + (1,))
             self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
                 d_state_features)
+        bp_tokvecs(d_tokvecs, sgd=sgd)

     @property
     def move_names(self):
@@ -714,11 +681,12 @@ cdef class Parser:
             names.append(name)
         return names

-    def get_batch_model(self, batch_size, tokvecs, stream, dropout):
-        _, lower, upper = self.model
-        state2vec = precompute_hiddens(batch_size, tokvecs,
+    def get_batch_model(self, docs, stream, dropout):
+        tok2vec, lower, upper = self.model
+        tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=dropout)
+        state2vec = precompute_hiddens(len(docs), tokvecs,
                                        lower, stream, drop=dropout)
-        return state2vec, upper
+        return (tokvecs, bp_tokvecs), state2vec, upper

     nr_feature = 8
@@ -781,7 +749,7 @@ cdef class Parser:
         # order, or the model goes out of synch
         self.cfg.setdefault('extra_labels', []).append(label)

-    def begin_training(self, gold_tuples, **cfg):
+    def begin_training(self, gold_tuples, pipeline=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
         gold_tuples = nonproj.preprocess_training_data(gold_tuples)
@@ -792,9 +760,20 @@ cdef class Parser:
         if self.model is True:
             cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
+            self.init_multitask_objectives(gold_tuples, pipeline, **cfg)
             link_vectors_to_models(self.vocab)
         self.cfg.update(cfg)

+    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
+        '''Setup models for secondary objectives, to benefit from multi-task
+        learning. This method is intended to be overridden by subclasses.
+
+        For instance, the dependency parser can benefit from sharing
+        an input representation with a label prediction model. These auxiliary
+        models are discarded after training.
+        '''
+        pass
+
     def preprocess_gold(self, docs_golds):
         for doc, gold in docs_golds:
             yield doc, gold
@@ -853,7 +832,7 @@ cdef class Parser:
             ('upper_model', lambda: self.model[2].to_bytes()),
             ('vocab', lambda: self.vocab.to_bytes()),
             ('moves', lambda: self.moves.to_bytes(strings=False)),
-            ('cfg', lambda: ujson.dumps(self.cfg))
+            ('cfg', lambda: json.dumps(self.cfg, indent=2, sort_keys=True))
         ))
         if 'model' in exclude:
             exclude['tok2vec_model'] = True
@@ -866,7 +845,7 @@ cdef class Parser:
         deserializers = OrderedDict((
             ('vocab', lambda b: self.vocab.from_bytes(b)),
             ('moves', lambda b: self.moves.from_bytes(b, strings=False)),
-            ('cfg', lambda b: self.cfg.update(ujson.loads(b))),
+            ('cfg', lambda b: self.cfg.update(json.loads(b))),
             ('tok2vec_model', lambda b: None),
             ('lower_model', lambda b: None),
             ('upper_model', lambda b: None)
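
The parser's cfg blob is now round-tripped with the standard json module and written with indent=2, sort_keys=True, so the serialized config is stable across runs and readable when inspecting a saved model; for example:

    import json

    cfg = {'beam_width': 1, 'hidden_width': 128, 'maxout_pieces': 2}
    print(json.dumps(cfg, indent=2, sort_keys=True))
    # {
    #   "beam_width": 1,
    #   "hidden_width": 128,
    #   "maxout_pieces": 2
    # }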

View File

@@ -61,33 +61,22 @@ def test_predict_doc(parser, tok2vec, model, doc):
     parser(doc)


-def test_update_doc(parser, tok2vec, model, doc, gold):
+def test_update_doc(parser, model, doc, gold):
     parser.model = model
-    tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
-    d_tokvecs = parser.update(([doc], tokvecs), [gold])
-    assert d_tokvecs[0].shape == tokvecs[0].shape

     def optimize(weights, gradient, key=None):
         weights -= 0.001 * gradient
-    bp_tokvecs(d_tokvecs, sgd=optimize)
-    assert d_tokvecs[0].sum() == 0.
+    parser.update([doc], [gold], sgd=optimize)


-def test_predict_doc_beam(parser, tok2vec, model, doc):
-    doc.tensor = tok2vec([doc])[0]
+def test_predict_doc_beam(parser, model, doc):
     parser.model = model
     parser(doc, beam_width=32, beam_density=0.001)
-    for word in doc:
-        print(word.text, word.head, word.dep_)


-def test_update_doc_beam(parser, tok2vec, model, doc, gold):
+def test_update_doc_beam(parser, model, doc, gold):
     parser.model = model
-    tokvecs, bp_tokvecs = tok2vec.begin_update([doc])
-    d_tokvecs = parser.update_beam(([doc], tokvecs), [gold])
-    assert d_tokvecs[0].shape == tokvecs[0].shape

     def optimize(weights, gradient, key=None):
         weights -= 0.001 * gradient
-    bp_tokvecs(d_tokvecs, sgd=optimize)
-    assert d_tokvecs[0].sum() == 0.
+    parser.update_beam([doc], [gold], sgd=optimize)

View File

@@ -11,7 +11,7 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(8, 8)
+    tagger1.model = tagger1.Model(8)
     tagger2.model = tagger1.model
     return (tagger1, tagger2)

View File

@@ -54,7 +54,7 @@ cdef class Doc:
     cdef public object noun_chunks_iterator

-    cdef int push_back(self, LexemeOrToken lex_or_tok, bint trailing_space) except -1
+    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1

     cpdef np.ndarray to_array(self, object features)

View File

@@ -324,7 +324,6 @@ cdef class Vocab:
                 self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
-        link_vectors_to_models(self)
         return self

     def to_bytes(self, **exclude):
@@ -364,7 +363,6 @@ cdef class Vocab:
             ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
-        link_vectors_to_models(self)
         return self

     def lexemes_to_bytes(self):

View File

@@ -17,7 +17,7 @@ fi
 if [ "${VIA}" == "compile" ]; then
   pip install -r requirements.txt
-  python setup.py clean --all
+  python setup.py build_ext --inplace
   pip install -e .
 fi