diff --git a/bin/parser/conll_train.py b/bin/parser/conll_train.py
index 8075dcd8a..e55215585 100755
--- a/bin/parser/conll_train.py
+++ b/bin/parser/conll_train.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+from __future__ import print_function
 from __future__ import division
 from __future__ import unicode_literals
@@ -9,6 +10,8 @@ import io
 import random
 import time
 import gzip
+import re
+import numpy

 import plac
 import cProfile
@@ -20,23 +23,29 @@ from spacy.gold import GoldParse
 from spacy.syntax.util import Config
 from spacy.syntax.arc_eager import ArcEager
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
+from spacy.syntax.beam_parser import BeamParser
 from spacy.scorer import Scorer
 from spacy.tagger import Tagger
+from spacy.syntax.nonproj import PseudoProjectivity
+from spacy.syntax import _parse_features as pf

 # Last updated for spaCy v0.97


-def read_conll(file_):
+def read_conll(file_, n=0):
     """Read a standard CoNLL/MALT-style format"""
-    sents = []
-    for sent_str in file_.read().strip().split('\n\n'):
+    text = file_.read().strip()
+    sent_strs = re.split(r'\n\s*\n', text)
+    for sent_id, sent_str in enumerate(sent_strs):
+        if not sent_str.strip():
+            continue
         ids = []
         words = []
         heads = []
         labels = []
         tags = []
-        for i, line in enumerate(sent_str.split('\n')):
+        for i, line in enumerate(sent_str.strip().split('\n')):
             word, pos_string, head_idx, label = _parse_line(line)
             words.append(word)
             if head_idx < 0:
@@ -45,10 +54,10 @@ def read_conll(file_):
             heads.append(head_idx)
             labels.append(label)
             tags.append(pos_string)
-        text = ' '.join(words)
         annot = (ids, words, tags, heads, labels, ['O'] * len(ids))
-        sents.append((None, [(annot, [])]))
-    return sents
+        yield (None, [(annot, None)])
+        if n and sent_id >= n:
+            break


 def _parse_line(line):
@@ -68,21 +77,33 @@ def _parse_line(line):
         pos = pieces[4]
         head_idx = int(pieces[6])-1
         label = pieces[7]
-        if head_idx == 0:
+        if head_idx < 0:
             label = 'ROOT'
         return word, pos, head_idx, label


+def print_words(strings, words, embeddings):
+    ids = {strings[word]: word for word in words}
+    vectors = {}
+    for key, values in embeddings[5]:
+        if key in ids:
+            vectors[strings[key]] = values
+    for word in words:
+        if word in vectors:
+            print(word, vectors[word])
+
+
 def score_model(scorer, nlp, raw_text, annot_tuples, verbose=False):
     tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-    nlp.tagger(tokens)
+    nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
     nlp.parser(tokens)
     gold = GoldParse(tokens, annot_tuples, make_projective=False)
     scorer.score(tokens, gold, verbose=verbose, punct_labels=('--', 'p', 'punct'))


-def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
-          gold_preproc=False, force_gold=False):
+def train(Language, gold_tuples, model_dir, dev_loc, n_iter=15, feat_set=u'basic',
+          learn_rate=0.001, update_step='sgd_cm',
+          batch_norm=False, seed=0, gold_preproc=False, force_gold=False):
     dep_model_dir = path.join(model_dir, 'deps')
     pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
@@ -92,66 +113,141 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
     os.mkdir(dep_model_dir)
     os.mkdir(pos_model_dir)

-    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                 labels=ArcEager.get_labels(gold_tuples))
+    if feat_set != 'neural':
+        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
+                     labels=ArcEager.get_labels(gold_tuples))
+
+    else:
+        feat_groups = [
+            (pf.core_words, 8),
+            (pf.core_tags, 4),
+            (pf.core_labels, 4),
+            (pf.core_shapes, 4),
+            ([f[0] for f in pf.valencies], 2)
+        ]
+        slots = []
+        vector_widths = []
+        feat_set = []
+        input_length = 0
+        for i, (feat_group, width) in enumerate(feat_groups):
+            feat_set.extend((f,) for f in feat_group)
+            slots += [i] * len(feat_group)
+            vector_widths.append(width)
+            input_length += width * len(feat_group)
+        hidden_layers = [128] * 5
+        rho = 1e-4
+        Config.write(dep_model_dir, 'config',
+            model='neural',
+            seed=seed,
+            labels=ArcEager.get_labels(gold_tuples),
+            feat_set=feat_set,
+            vector_widths=vector_widths,
+            slots=slots,
+            hidden_layers=hidden_layers,
+            update_step=update_step,
+            batch_norm=batch_norm,
+            eta=learn_rate,
+            mu=0.9,
+            ensemble_size=1,
+            rho=rho)

     nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
     nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
     nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

+    for word in nlp.vocab:
+        word.norm = word.orth
+    words = list(nlp.vocab)
+    top5k = numpy.ndarray(shape=(10000, len(word.vector)), dtype='float32')
+    norms = numpy.ndarray(shape=(10000,), dtype='float32')
+    for i in range(10000):
+        if i >= 400 and words[i].has_vector:
+            top5k[i] = words[i].vector
+            norms[i] = numpy.sqrt(sum(top5k[i] ** 2))
+        else:
+            # Make these way off values, to make big distance.
+            top5k[i] = 100.0
+            norms[i] = 100.0
+    print("Setting vectors")
+    for word in words[10000:]:
+        if word.has_vector:
+            cosines = numpy.dot(top5k, word.vector)
+            cosines /= norms * numpy.sqrt(sum(word.vector ** 2))
+            most_similar = words[numpy.argmax(cosines)]
+            word.norm = most_similar.norm
+        else:
+            word.norm = word.shape
+
+    print(nlp.parser.model.widths)

-    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
+    print("Itn.\tP.Loss\tPruned\tTrain\tDev\tSize")
+    last_score = 0.0
+    nr_trimmed = 0
+    eg_seen = 0
+    loss = 0
     for itn in range(n_iter):
-        scorer = Scorer()
-        loss = 0
+        random.shuffle(gold_tuples)
         for _, sents in gold_tuples:
             for annot_tuples, _ in sents:
-                if len(annot_tuples[1]) == 1:
-                    continue
-
-                score_model(scorer, nlp, None, annot_tuples, verbose=False)
-
                 tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
-                nlp.tagger(tokens)
-                gold = GoldParse(tokens, annot_tuples, make_projective=True)
-                if not gold.is_projective:
-                    raise Exception(
-                        "Non-projective sentence in training, after we should "
-                        "have enforced projectivity: %s" % annot_tuples
-                    )
-
+                nlp.tagger.tag_from_strings(tokens, annot_tuples[2])
+                gold = GoldParse(tokens, annot_tuples)
                 loss += nlp.parser.train(tokens, gold)
-                nlp.tagger.train(tokens, gold.tags)
-        random.shuffle(gold_tuples)
-        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
-                                             scorer.tags_acc, scorer.token_acc))
-    print('end training')
+                eg_seen += 1
+                if eg_seen % 10000 == 0:
+                    scorer = Scorer()
+                    with io.open(dev_loc, 'r', encoding='utf8') as file_:
+                        for _, sents in read_conll(file_):
+                            for annot_tuples, _ in sents:
+                                score_model(scorer, nlp, None, annot_tuples)
+                    train_scorer = Scorer()
+                    for _, sents in gold_tuples[:1000]:
+                        for annot_tuples, _ in sents:
+                            score_model(train_scorer, nlp, None, annot_tuples)
+                    print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%d' % (itn, int(loss), nr_trimmed,
+                                                             train_scorer.uas, scorer.uas,
+                                                             nlp.parser.model.mem.size))
+                    loss = 0
+        if feat_set != 'basic':
+            nlp.parser.model.eta *= 0.99
+            threshold = 0.05 * (1.05 ** itn)
+            nr_trimmed = nlp.parser.model.sparsify_embeddings(threshold, True)
     nlp.end_training(model_dir)
-    print('done')
+    return nlp


 @plac.annotations(
     train_loc=("Location of CoNLL 09 formatted training file"),
     dev_loc=("Location of CoNLL 09 formatted development file"),
     model_dir=("Location of output model directory"),
-    eval_only=("Skip training, and only evaluate", "flag", "e", bool),
     n_iter=("Number of training iterations", "option", "i", int),
+    batch_norm=("Use batch normalization and residual connections", "flag", "b"),
+    update_step=("Update step", "option", "u", str),
+    learn_rate=("Learn rate", "option", "e", float),
+    neural=("Use neural network?", "flag", "N")
 )
-def main(train_loc, dev_loc, model_dir, n_iter=15):
+def main(train_loc, dev_loc, model_dir, n_iter=15, neural=False, batch_norm=False,
+         learn_rate=0.001, update_step='sgd_cm'):
     with io.open(train_loc, 'r', encoding='utf8') as file_:
-        train_sents = read_conll(file_)
-    if not eval_only:
-        train(English, train_sents, model_dir, n_iter=n_iter)
-        nlp = English(data_dir=model_dir)
-    dev_sents = read_conll(io.open(dev_loc, 'r', encoding='utf8'))
+        train_sents = list(read_conll(file_))
+    # preprocess training data here before ArcEager.get_labels() is called
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+
+    nlp = train(English, train_sents, model_dir, dev_loc, n_iter=n_iter,
+                feat_set='neural' if neural else 'basic',
+                batch_norm=batch_norm,
+                learn_rate=learn_rate,
+                update_step=update_step)
     scorer = Scorer()
-    for _, sents in dev_sents:
-        for annot_tuples, _ in sents:
-            score_model(scorer, nlp, None, annot_tuples)
-    print('TOK', 100-scorer.token_acc)
+    with io.open(dev_loc, 'r', encoding='utf8') as file_:
+        for _, sents in read_conll(file_):
+            for annot_tuples, _ in sents:
+                score_model(scorer, nlp, None, annot_tuples)
+    print('TOK', scorer.token_acc)
     print('POS', scorer.tags_acc)
     print('UAS', scorer.uas)
     print('LAS', scorer.las)

+
 if __name__ == '__main__':
     plac.call(main)
diff --git a/bin/parser/train.py b/bin/parser/train.py
index 372c7932e..0a86bf933 100755
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -23,7 +23,8 @@ from spacy.scorer import Scorer
 from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.ner import BiluoPushDown
 from spacy.tagger import Tagger
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
+from spacy.syntax.beam_parser import BeamParser
 from spacy.syntax.nonproj import PseudoProjectivity
@@ -103,6 +104,23 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),
                  beam_width=beam_width,projectivize=pseudoprojective)
+    #feat_set, slots = get_templates('neural')
+    #vector_widths = [10, 10, 10]
+    #hidden_layers = [100, 100, 100]
+    #update_step = 'adam'
+    #eta = 0.001
+    #rho = 1e-4
+    #Config.write(dep_model_dir, 'config', model='neural',
+    #    seed=seed, labels=ArcEager.get_labels(gold_tuples),
+    #    feat_set=feat_set,
+    #    vector_widths=vector_widths,
+    #    slots=slots,
+    #    hidden_layers=hidden_layers,
+    #    update_step=update_step,
+    #    eta=eta,
+    #    rho=rho)
+
+
     Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                  labels=BiluoPushDown.get_labels(gold_tuples),
                  beam_width=0)
@@ -112,8 +130,13 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',

     nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
     nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
-    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
-    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
+    nlp.parser = BeamParser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
+    nlp.entity = BeamParser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
+    print(nlp.parser.model.widths)
+    for raw_text, sents in gold_tuples:
+        for annot_tuples, ctnt in sents:
+            for word in annot_tuples[1]:
+                _ = nlp.vocab[word]
     print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
     for itn in range(n_iter):
         scorer = Scorer()
@@ -224,12 +247,13 @@ def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc=
     if not eval_only:
         gold_train = list(read_json_file(train_loc))
         train(lang, gold_train, model_dir,
-              feat_set='basic' if not debug else 'debug',
+              feat_set='neural' if not debug else 'debug',
              gold_preproc=gold_preproc, n_sents=n_sents,
              corruption_level=corruption_level, n_iter=n_iter,
              verbose=verbose,pseudoprojective=pseudoprojective)
     if out_loc:
         write_parses(lang, dev_loc, model_dir, out_loc)
+    print(model_dir)
     scorer = evaluate(lang, list(read_json_file(dev_loc)), model_dir,
                       gold_preproc=gold_preproc, verbose=verbose)
     print('TOK', scorer.token_acc)
diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index 213591804..108be6192 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -16,24 +16,86 @@ from spacy.syntax.arc_eager import ArcEager
 from spacy.syntax.parser import get_templates
 from spacy.scorer import Scorer
 import spacy.attrs
+from spacy.syntax.nonproj import PseudoProjectivity
+
+from spacy.syntax._parse_features import *

 from spacy.language import Language

-from spacy.tagger import W_orth
-
-TAGGER_TEMPLATES = (
-    (W_orth,),
-)
-
 try:
     from codecs import open
 except ImportError:
     pass

+features = [
+    (S2W,),
+    (S1W, ),
+    (S1rW,),
+    (S0lW, ),
+    (S0l2W, ),
+    (S0W, ),
+    (S0r2W, ),
+    (S0rW, ),
+    (N0l2W, ),
+    (N0lW, ),
+    (N0W, ),
+    (N1W, ),
+    (N2W, )
+]
+
+slots = [0] * len(features)
+
+features += [
+    (S2p,),
+    (S1p, ),
+    (S1rp,),
+    (S0lp,),
+    (S0l2p,),
+    (S0p, ),
+    (S0r2p, ),
+    (S0rp, ),
+    (N0l2p, ),
+    (N0lp, ),
+    (N0p, ),
+    (N1p, ),
+    (N2p, )
+]
+
+slots += [1] * (len(features) - len(slots))
+
+features += [
+    (S2L,),
+    (S1L,),
+    (S1rL,),
+    (S0lL,),
+    (S0l2L,),
+    (S0L,),
+    (S0rL,),
+    (S0r2L,),
+    (N0l2L,),
+    (N0lL,),
+]
+slots += [2] * (len(features) - len(slots))
+#
+#features += [(S2p, S1p), (S1p, S0p)]
+#slots += [3, 3]
+#features += [(S0p, N0p)]
+#slots += [4]
+#    (S0l2p, S0l2L, S0lp, S0l2L),
+#    (N0l2p, N0l2L, N0lp, N0lL),
+#    (S1p, S1rp, S1rL),
+#    (S0p, S0rp, S0rL),
+#)
+
+
+

 class TreebankParser(object):
     @staticmethod
-    def setup_model_dir(model_dir, labels, templates, feat_set='basic', seed=0):
+    def setup_model_dir(model_dir, labels, vector_widths=(300,), slots=(0,),
+                        hidden_layers=(300, 300),
+                        feat_set='basic', seed=0, update_step='sgd', eta=0.005, rho=0.0):
         dep_model_dir = path.join(model_dir, 'deps')
         pos_model_dir = path.join(model_dir, 'pos')
         if path.exists(dep_model_dir):
@@ -43,15 +105,16 @@ class TreebankParser(object):
         os.mkdir(dep_model_dir)
         os.mkdir(pos_model_dir)

-        Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
-                     labels=labels)
+        Config.write(dep_model_dir, 'config', model='neural', feat_set=feat_set,
+                     seed=seed, labels=labels, vector_widths=vector_widths, slots=slots,
+                     hidden_layers=hidden_layers, update_step=update_step, eta=eta, rho=rho)

     @classmethod
     def from_dir(cls, tag_map, model_dir):
-        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
+        vocab = Vocab.load(model_dir, get_lex_attr=Language.default_lex_attrs())
         vocab.get_lex_attr[spacy.attrs.LANG] = lambda _: 0
         tokenizer = Tokenizer(vocab, {}, None, None, None)
-        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)
+        tagger = Tagger.blank(vocab, Tagger.default_templates())

         cfg = Config.read(path.join(model_dir, 'deps'), 'config')
         parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
@@ -64,22 +127,14 @@ class TreebankParser(object):
         self.parser = parser

     def train(self, words, tags, heads, deps):
-        tokens = self.tokenizer.tokens_from_list(list(words))
-        self.tagger.train(tokens, tags)
-
         tokens = self.tokenizer.tokens_from_list(list(words))
         ids = range(len(words))
         ner = ['O'] * len(words)
-        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)),
-                         make_projective=False)
-        self.tagger(tokens)
-        if gold.is_projective:
-            try:
-                self.parser.train(tokens, gold)
-            except:
-                for id_, word, head, dep in zip(ids, words, heads, deps):
-                    print(id_, word, head, dep)
-                raise
+        gold = GoldParse(tokens, ((ids, words, tags, heads, deps, ner)))
+        self.tagger.tag_from_strings(tokens, tags)
+        loss = self.parser.train(tokens, gold)
+        PseudoProjectivity.deprojectivize(tokens)
+        return loss

     def __call__(self, words, tags=None):
         tokens = self.tokenizer.tokens_from_list(list(words))
@@ -88,6 +143,7 @@ class TreebankParser(object):
         else:
             self.tagger.tag_from_strings(tokens, tags)
         self.parser(tokens)
+        PseudoProjectivity.deprojectivize(tokens)
         return tokens

     def end_training(self, data_dir):
@@ -101,8 +157,6 @@ class TreebankParser(object):
         self.vocab.dump(path.join(data_dir, 'vocab', 'lexemes.bin'))


-
-
 def read_conllx(loc):
     with open(loc, 'r', 'utf8') as file_:
         text = file_.read()
@@ -119,8 +173,8 @@ def read_conllx(loc):
                 id_ = int(id_) - 1
                 head = (int(head) - 1) if head != '0' else id_
                 dep = 'ROOT' if dep == 'root' else dep
-                tokens.append((id_, word, tag, head, dep, 'O'))
-            tuples = zip(*tokens)
+                tokens.append([id_, word, tag, head, dep, 'O'])
+            tuples = [list(el) for el in zip(*tokens)]
             yield (None, [(tuples, [])])


@@ -134,27 +188,38 @@ def score_model(nlp, gold_docs, verbose=False):
     return scorer


-def main(train_loc, dev_loc, model_dir, tag_map_loc):
+@plac.annotations(
+    n_iter=("Number of training iterations", "option", "i", int),
+)
+def main(train_loc, dev_loc, model_dir, tag_map_loc, n_iter=10):
     with open(tag_map_loc) as file_:
         tag_map = json.loads(file_.read())
     train_sents = list(read_conllx(train_loc))
-    labels = ArcEager.get_labels(train_sents)
-    templates = get_templates('basic')
+    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)
+    dev_sents = list(read_conllx(dev_loc))

-    TreebankParser.setup_model_dir(model_dir, labels, templates)
+    labels = ArcEager.get_labels(train_sents)
+
+    TreebankParser.setup_model_dir(model_dir, labels,
+        feat_set=features, vector_widths=(10,10,10,30,30), slots=slots,
+        hidden_layers=(100,100,100), update_step='adam')

     nlp = TreebankParser.from_dir(tag_map, model_dir)
+    nlp.parser.model.rho = 1e-4
+    print(nlp.parser.model.widths)

-    for itn in range(15):
+    for itn in range(n_iter):
+        loss = 0.0
         for _, doc_sents in train_sents:
             for (ids, words, tags, heads, deps, ner), _ in doc_sents:
-                nlp.train(words, tags, heads, deps)
+                loss += nlp.train(words, tags, heads, deps)
         random.shuffle(train_sents)
-        scorer = score_model(nlp, read_conllx(dev_loc))
-        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
+        scorer = score_model(nlp, dev_sents)
+        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
+        print(nlp.parser.model.mem.size)
     nlp.end_training(model_dir)
     scorer = score_model(nlp, read_conllx(dev_loc))
-    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
+    print('Dev: %.3f\t%.3f\t%.3f' % (scorer.uas, scorer.las, scorer.tags_acc))


 if __name__ == '__main__':
diff --git a/setup.py b/setup.py
index 2098fb377..3871432cc 100644
--- a/setup.py
+++ b/setup.py
@@ -51,6 +51,7 @@ MOD_NAMES = [
     'spacy.syntax._state',
     'spacy.tokenizer',
     'spacy.syntax.parser',
+    'spacy.syntax.beam_parser',
     'spacy.syntax.nonproj',
     'spacy.syntax.transition_system',
     'spacy.syntax.arc_eager',
@@ -73,7 +74,8 @@ MOD_NAMES = [
 compile_options = {
     'msvc': ['/Ox', '/EHsc'],
     'mingw32' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'],
-    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
+    'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function',
+               '-I/Users/matt/blis/include/blis']
 }
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index c3badc60d..de5b129fd 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 import numpy
 import io
 import json
@@ -264,13 +265,3 @@ cdef class GoldParse:

 def is_punct_label(label):
     return label == 'P' or label.lower() == 'punct'
-
-
-
-
-
-
-
-
-
-
diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx
index bc54e0c9d..4a17a0d61 100644
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@@ -35,8 +35,8 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
         context[11] = 0
         context[12] = 0
     else:
-        context[0] = token.lex.orth
-        context[1] = token.lemma
+        context[0] = token.lex.norm
+        context[1] = token.lex.norm
         context[2] = token.tag
         context[3] = token.lex.cluster
         # We've read in the string little-endian, so now we can take & (2**n)-1
@@ -366,27 +366,26 @@ trigrams = (


 words = (
-    S2w,
-    S1w,
-    S1rw,
-    S0lw,
-    S0l2w,
-    S0w,
-    S0r2w,
-    S0rw,
-    N0lw,
-    N0l2w,
-    N0w,
-    N1w,
-    N2w,
-    P1w,
-    P2w
+    S2W,
+    S1W,
+    S1rW,
+    S0lW,
+    S0l2W,
+    S0W,
+    S0r2W,
+    S0rW,
+    N0lW,
+    N0l2W,
+    N0W,
+    N1W,
+    N2W,
+    P1W,
+    P2W
 )

 tags = (
     S2p,
     S1p,
-    S1rp,
     S0lp,
     S0l2p,
     S0p,
@@ -404,7 +403,6 @@ tags = (
 labels = (
     S2L,
     S1L,
-    S1rL,
     S0lL,
     S0l2L,
     S0L,
@@ -412,9 +410,88 @@ labels = (
     S0rL,
     N0lL,
     N0l2L,
-    N0L,
-    N1L,
-    N2L,
-    P1L,
-    P2L
 )
+
+core_words = (
+    S2w,
+    S1w,
+    S0lw,
+    S0l2w,
+    S0w,
+    S0rw,
+    S0r2w,
+    N0lw,
+    N0l2w,
+    N0w,
+    N1w,
+    N2w,
+)
+
+
+core_shapes = (
+    S2_shape,
+    S1_shape,
+    S0l_shape,
+    S0l2_shape,
+    S0_shape,
+    S0r_shape,
+    S0r2_shape,
+    N0l_shape,
+    N0l2_shape,
+    N0_shape,
+    N1_shape,
+    N2_shape,
+)
+
+
+core_clusters = (
+    S2c,
+    S1c,
+    S0lc,
+    S0l2c,
+    S0c,
+    S0rc,
+    S0r2c,
+    N0lc,
+    N0l2c,
+    N0c,
+    N1c,
+    N2c,
+)
+
+
+
+core_tags = (
+    S2p,
+    S1p,
+    S0lp,
+    S0l2p,
+    S0p,
+    S0r2p,
+    S0rp,
+    N0lp,
+    N0l2p,
+    N0p,
+    N1p,
+    N2p,
+)
+
+core_labels = (
+    S2L,
+    S1L,
+    S0lL,
+    S0l2L,
+    S0L,
+    S0r2L,
+    S0rL,
+    N0lL,
+    N0l2L,
+)
+
+valencies = (
+    (N0lv,),
+    (S0lv,),
+    (S0rv,),
+    (S1lv,),
+    (S1rv,),
+)
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index bf1dbf90f..ad8dc2ef2 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -1,6 +1,9 @@
 from libc.string cimport memcpy, memset
 from libc.stdlib cimport malloc, calloc, free
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
+
+from murmurhash.mrmr cimport hash64
+
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport TokenC, Entity
 from ..lexeme cimport Lexeme
@@ -201,6 +204,21 @@ cdef cppclass StateC:
         else:
             return this.length - this._b_i

+    uint64_t hash() nogil const:
+        cdef TokenC[11] sig
+        sig[0] = this.S_(2)[0]
+        sig[1] = this.S_(1)[0]
+        sig[2] = this.R_(this.S(1), 1)[0]
+        sig[3] = this.L_(this.S(0), 1)[0]
+        sig[4] = this.L_(this.S(0), 2)[0]
+        sig[5] = this.S_(0)[0]
+        sig[6] = this.R_(this.S(0), 2)[0]
+        sig[7] = this.R_(this.S(0), 1)[0]
+        sig[8] = this.B_(0)[0]
+        sig[9] = this.E_(0)[0]
+        sig[10] = this.E_(1)[0]
+        return hash64(sig, sizeof(sig), this._s_i)
+
     void push() nogil:
         if this.B(0) != -1:
             this._stack[this._s_i] = this.B(0)
@@ -290,6 +308,8 @@ cdef cppclass StateC:
         memcpy(this._stack, src._stack, this.length * sizeof(int))
         memcpy(this._buffer, src._buffer, this.length * sizeof(int))
         memcpy(this._ents, src._ents, this.length * sizeof(Entity))
+        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
+        this.length = src.length
         this._b_i = src._b_i
         this._s_i = src._s_i
         this._e_i = src._e_i
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 4e2590734..409676c55 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -436,4 +436,11 @@ cdef class ArcEager(TransitionSystem):
             else:
                 is_valid[i] = False
                 costs[i] = 9000
-        assert n_gold >= 1
+        if n_gold < 1:
+            for annot in gold.orig_annot:
+                print(annot)
+            print([move_costs[i] for i in range(N_MOVES)])
+            print(gold.orig_annot[stcls.S(0)][1], gold.orig_annot[stcls.B(0)][1])
+            print(gold.heads[stcls.S(0)], gold.heads[stcls.B(0)])
+            print(gold.labels[stcls.S(0)], gold.labels[stcls.B(0)])
+            raise Exception("No gold moves")
diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx
index 9ceb0c0bf..cec0ea57f 100644
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@@ -10,7 +10,7 @@ def english_noun_chunks(doc):
     for i, word in enumerate(doc):
         if word.pos in (NOUN, PROPN, PRON) and word.dep in np_deps:
             yield word.left_edge.i, word.i+1, np_label
-        elif word.pos == NOUN and word.dep == conj:
+        elif word.pos in (NOUN, PROPN, PRON) and word.dep == conj:
             head = word.head
             while head.dep == conj and head.head.i < head.i:
                 head = head.head
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index c22254c66..956c178a7 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -1,25 +1,37 @@
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.neural.nn cimport NeuralNet
+from thinc.linear.features cimport ConjunctionExtracter
 from thinc.base cimport Model
 from thinc.extra.eg cimport Example
+from thinc.typedefs cimport weight_t
+from thinc.structs cimport FeatureC

 from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
-from thinc.structs cimport ExampleC
+from thinc.structs cimport NeuralNetC, ExampleC
 from ._state cimport StateC


 cdef class ParserNeuralNet(NeuralNet):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
+    cdef ConjunctionExtracter extracter
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
+

 cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil
+

+cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
+    cdef object _models
+    cdef NeuralNetC** _models_c
+    cdef int** _masks
+    cdef int _nr_model
+
+
 cdef class Parser:
-    cdef readonly ParserNeuralNet model
+    cdef readonly Model model
     cdef readonly TransitionSystem moves
     cdef int _projectivize
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index b83f7bc07..3b1d7a284 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -1,4 +1,5 @@
 # cython: infer_types=True
+# cython: profile=True
 """
 MALT-style dependency parser
 """
@@ -18,13 +19,14 @@ import shutil
 import json
 import sys
 from .nonproj import PseudoProjectivity
+import random

 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
-from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
+from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t, idx_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from thinc.structs cimport SparseArrayC, ExampleC
+from thinc.structs cimport NeuralNetC, SparseArrayC, ExampleC
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from thinc.structs cimport FeatureC
@@ -61,8 +63,10 @@ def get_templates(name):
         return pf.ner
     elif name == 'debug':
         return pf.unigrams
-    elif name.startswith('embed'):
-        return (pf.words, pf.tags, pf.labels)
+    elif name.startswith('neural'):
+        features = pf.words + pf.tags + pf.labels
+        slots = [0] * len(pf.words) + [1] * len(pf.tags) + [2] * len(pf.labels)
+        return ([(f,) for f in features], slots)
     else:
         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s1_s0 + pf.s0_n1 + pf.n0_n1 + \
                 pf.tree_shape + pf.trigrams)
@@ -73,72 +77,238 @@ def ParserFactory(transition_system):


 cdef class ParserPerceptron(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+    @property
+    def widths(self):
+        return (self.extracter.nr_templ,)
+
+    def update(self, Example eg):
+        '''Does regression on negative cost. Sort of cute?'''
+        self.time += 1
+        cdef weight_t loss = 0.0
+        best = eg.best
+        for clas in range(eg.c.nr_class):
+            if not eg.c.is_valid[clas]:
+                continue
+            if eg.c.scores[clas] < eg.c.scores[best]:
+                continue
+            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
+            d_loss = 2 * (-eg.c.costs[clas] - eg.c.scores[clas])
+            step = d_loss * 0.001
+            for feat in eg.c.features[:eg.c.nr_feat]:
+                self.update_weight(feat.key, clas, feat.value * step)
+        return int(loss)
+
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
+        state = <const StateC*>_state
         fill_context(eg.atoms, state)
         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)


 cdef class ParserNeuralNet(NeuralNet):
-    def __init__(self, nr_class, hidden_width=50, depth=2, word_width=50,
-                 tag_width=20, dep_width=20, update_step='sgd', eta=0.01, rho=0.0):
-        #input_length = 3 * word_width + 5 * tag_width + 3 * dep_width
-        input_length = 12 * word_width + 7 * dep_width
-        widths = [input_length] + [hidden_width] * depth + [nr_class]
-        #vector_widths = [word_width, tag_width, dep_width]
-        #slots = [0] * 3 + [1] * 5 + [2] * 3
-        vector_widths = [word_width, dep_width]
-        slots = [0] * 12 + [1] * 7
-        NeuralNet.__init__(
-            self,
-            widths,
-            embed=(vector_widths, slots),
-            eta=eta,
-            rho=rho,
-            update_step=update_step)
+    def __init__(self, shape, **kwargs):
+        vector_widths = [4] * 57
+        slots = [0, 1, 2, 3] # S0
+        slots += [4, 5, 6, 7] # S1
+        slots += [8, 9, 10, 11] # S2
+        slots += [12, 13, 14, 15] # S3+
+        slots += [16, 17, 18, 19] # B0
+        slots += [20, 21, 22, 23] # B1
+        slots += [24, 25, 26, 27] # B2
+        slots += [28, 29, 30, 31] # B3+
+        slots += [32, 33, 34, 35] * 2 # S0l, S0r
+        slots += [36, 37, 38, 39] * 2 # B0l, B0r
+        slots += [40, 41, 42, 43] * 2 # S1l, S1r
+        slots += [44, 45, 46, 47] * 2 # S2l, S2r
+        slots += [48, 49, 50, 51, 52]
+        slots += [53, 54, 55, 56]
+        input_length = sum(vector_widths[slot] for slot in slots)
+        widths = [input_length] + shape[3:]
+
+        NeuralNet.__init__(self, widths, embed=(vector_widths, slots), **kwargs)

     @property
     def nr_feat(self):
-        #return 3+5+3
-        return 12+7
+        return 2000

-    cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil:
+    cdef void set_featuresC(self, ExampleC* eg, const void* _state) nogil:
+        memset(eg.features, 0, 2000 * sizeof(FeatureC))
+        state = <const StateC*>_state
         fill_context(eg.atoms, state)
-        eg.nr_feat = 12 + 7
-        for j in range(eg.nr_feat):
-            eg.features[j].value = 1.0
-            eg.features[j].i = j
-
-        #eg.features[0].key = eg.atoms[S0w]
-        #eg.features[1].key = eg.atoms[S1w]
-        #eg.features[2].key = eg.atoms[N0w]
+        feats = eg.features

-        eg.features[0].key = eg.atoms[S2W]
-        eg.features[1].key = eg.atoms[S1W]
-        eg.features[2].key = eg.atoms[S0lW]
-        eg.features[3].key = eg.atoms[S0l2W]
-        eg.features[4].key = eg.atoms[S0W]
-        eg.features[5].key = eg.atoms[S0r2W]
-        eg.features[6].key = eg.atoms[S0rW]
-        eg.features[7].key = eg.atoms[N0lW]
-        eg.features[8].key = eg.atoms[N0l2W]
-        eg.features[9].key = eg.atoms[N0W]
-        eg.features[10].key = eg.atoms[N1W]
-        eg.features[11].key = eg.atoms[N2W]
+        feats = _add_token(feats, 0, state.S_(0), 1.0)
+        feats = _add_token(feats, 4, state.S_(1), 1.0)
+        feats = _add_token(feats, 8, state.S_(2), 1.0)
+        # Rest of the stack, with exponential decay
+        for i in range(3, state.stack_depth()):
+            feats = _add_token(feats, 12, state.S_(i), 1.0 * 0.5**(i-2))
+        feats = _add_token(feats, 16, state.B_(0), 1.0)
+        feats = _add_token(feats, 20, state.B_(1), 1.0)
+        feats = _add_token(feats, 24, state.B_(2), 1.0)
+        # Rest of the buffer, with exponential decay
+        for i in range(3, min(8, state.buffer_length())):
+            feats = _add_token(feats, 28, state.B_(i), 1.0 * 0.5**(i-2))
+        feats = _add_subtree(feats, 32, state, state.S(0))
+        feats = _add_subtree(feats, 40, state, state.B(0))
+        feats = _add_subtree(feats, 48, state, state.S(1))
+        feats = _add_subtree(feats, 56, state, state.S(2))
+        feats = _add_pos_bigram(feats, 64, state.S_(0), state.B_(0))
+        feats = _add_pos_bigram(feats, 65, state.S_(1), state.S_(0))
+        feats = _add_pos_bigram(feats, 66, state.S_(1), state.B_(0))
+        feats = _add_pos_bigram(feats, 67, state.S_(0), state.B_(1))
+        feats = _add_pos_bigram(feats, 68, state.B_(0), state.B_(1))
+        feats = _add_pos_trigram(feats, 69, state.S_(1), state.S_(0), state.B_(0))
+        feats = _add_pos_trigram(feats, 70, state.S_(0), state.B_(0), state.B_(1))
+        feats = _add_pos_trigram(feats, 71, state.S_(0), state.R_(state.S(0), 1),
+                                 state.R_(state.S(0), 2))
+        feats = _add_pos_trigram(feats, 72, state.S_(0), state.L_(state.S(0), 1),
+                                 state.L_(state.S(0), 2))
+        eg.nr_feat = feats - eg.features

-        eg.features[12].key = eg.atoms[S2L]
-        eg.features[13].key = eg.atoms[S1L]
-        eg.features[14].key = eg.atoms[S0l2L]
-        eg.features[15].key = eg.atoms[S0lL]
-        eg.features[16].key = eg.atoms[S0L]
-        eg.features[17].key = eg.atoms[S0r2L]
-        eg.features[18].key = eg.atoms[S0rL]
+
+cdef inline FeatureC* _add_token(FeatureC* feats,
+        int slot, const TokenC* token, weight_t value) nogil:
+    # Word
+    feats.i = slot
+    feats.key = token.lex.norm
+    feats.value = value
+    feats += 1
+    # POS tag
+    feats.i = slot+1
+    feats.key = token.tag
+    feats.value = value
+    feats += 1
+    # Dependency label
+    feats.i = slot+2
+    feats.key = token.dep
+    feats.value = value
+    feats += 1
+    # Word, label, tag
+    feats.i = slot+3
+    cdef uint64_t key[3]
+    key[0] = token.lex.cluster
+    key[1] = token.tag
+    key[2] = token.dep
+    feats.key = hash64(key, sizeof(key), 0)
+    feats.value = value
+    feats += 1
+    return feats
+
+
+cdef inline FeatureC* _add_subtree(FeatureC* feats, int slot, const StateC* state, int t) nogil:
+    value = 1.0
+    for i in range(state.n_R(t)):
+        feats = _add_token(feats, slot, state.R_(t, i+1), value)
+        value *= 0.5
+    slot += 4
+    value = 1.0
+    for i in range(state.n_L(t)):
+        feats = _add_token(feats, slot, state.L_(t, i+1), value)
+        value *= 0.5
+    return feats
+
+
+cdef inline FeatureC* _add_pos_bigram(FeatureC* feat, int slot,
+        const TokenC* t1, const TokenC* t2) nogil:
+    cdef uint64_t[2] key
+    key[0] = t1.tag
+    key[1] = t2.tag
+    feat.i = slot
+    feat.key = hash64(key, sizeof(key), slot)
+    feat.value = 1.0
+    return feat+1
+
+
+cdef inline FeatureC* _add_pos_trigram(FeatureC* feat, int slot,
+        const TokenC* t1, const TokenC* t2, const TokenC* t3) nogil:
+    cdef uint64_t[3] key
+    key[0] = t1.tag
+    key[1] = t2.tag
+    key[2] = t3.tag
+    feat.i = slot
+    feat.key = hash64(key, sizeof(key), slot)
+    feat.value = 1.0
+    return feat+1
+
+
+cdef class ParserNeuralNetEnsemble(ParserNeuralNet):
+    def __init__(self, shape, update_step='sgd', eta=0.01, rho=0.0, n=5):
+        ParserNeuralNet.__init__(self, shape, update_step=update_step, eta=eta, rho=rho)
+        self._models_c = <NeuralNetC**>self.mem.alloc(sizeof(NeuralNetC*), n)
+        self._masks = <int**>self.mem.alloc(sizeof(int*), n)
+        self._models = []
+        cdef ParserNeuralNet model
+        threshold = 1.5 / n
+        self._nr_model = n
+        for i in range(n):
+            self._masks[i] = <int*>self.mem.alloc(sizeof(int), self.nr_feat)
+            for j in range(self.nr_feat):
+                self._masks[i][j] = random.random() < threshold
+            # We have to pass our pool here, because the embedding table passes
+            # it around.
+            model = ParserNeuralNet(shape, update_step=update_step, eta=eta, rho=rho)
+            self._models_c[i] = &model.c
+            self._models.append(model)
+
+    property eta:
+        def __get__(self):
+            return self._models[0].eta
+
+        def __set__(self, weight_t value):
+            for model in self._models:
+                model.eta = value
+
+    def sparsify_embeddings(self, penalty):
+        p = 0.0
+        for model in self._models:
+            p += model.sparsify_embeddings(penalty)
+        return p / len(self._models)
+
+    cdef void set_scoresC(self, weight_t* scores, const void* _feats,
+            int nr_feat, int is_sparse) nogil:
+        nr_class = self.c.widths[self.c.nr_layer-1]
+        sub_scores = <weight_t*>calloc(sizeof(weight_t), nr_class)
+        sub_feats = <FeatureC*>calloc(sizeof(FeatureC), nr_feat)
+        feats = <const FeatureC*>_feats
+        for i in range(self._nr_model):
+            for j in range(nr_feat):
+                sub_feats[j] = feats[j]
+                sub_feats[j].value *= self._masks[i][j]
+            self.c = self._models_c[i][0]
+            self.c.weights = self._models_c[i].weights
+            self.c.gradient = self._models_c[i].gradient
+            ParserNeuralNet.set_scoresC(self, sub_scores, sub_feats, nr_feat, 1)
+            for j in range(nr_class):
+                scores[j] += sub_scores[j]
+                sub_scores[j] = 0.0
+        for j in range(nr_class):
+            scores[j] /= self._nr_model
+        free(sub_feats)
+        free(sub_scores)
+
+    def update(self, Example eg):
+        if eg.cost == 0:
+            return 0.0
+        loss = 0.0
+        full_feats = <FeatureC*>calloc(sizeof(FeatureC), eg.nr_feat)
+        memcpy(full_feats, eg.c.features, sizeof(FeatureC) * eg.nr_feat)
+        cdef ParserNeuralNet model
+        for i, model in enumerate(self._models):
+            for j in range(eg.nr_feat):
+                eg.c.features[j].value *= self._masks[i][j]
+            loss += model.update(eg)
+            memcpy(eg.c.features, full_feats, sizeof(FeatureC) * eg.nr_feat)
+        free(full_feats)
+        return loss
+
+    def end_training(self):
+        for model in self._models:
+            model.end_training()


 cdef class Parser:
-    def __init__(self, StringStore strings, transition_system, ParserNeuralNet model,
-                 int projectivize = 0):
+    def __init__(self, StringStore strings, transition_system, model):
         self.moves = transition_system
         self.model = model
-        self._projectivize = projectivize

     @classmethod
     def from_dir(cls, model_dir, strings, transition_system):
@@ -148,16 +318,24 @@ cdef class Parser:
             print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
         cfg = Config.read(model_dir, 'config')
         moves = transition_system(strings, cfg.labels)
-        model = ParserNeuralNet(moves.n_moves, hidden_width=cfg.hidden_width,
-                                depth=cfg.depth, word_width=cfg.word_width,
-                                tag_width=cfg.tag_width, dep_width=cfg.dep_width,
-                                update_step=cfg.update_step,
-                                eta=cfg.eta, rho=cfg.rho)
-        project = cfg.projectivize if hasattr(cfg,'projectivize') else False
+        if cfg.get('model') == 'neural':
+            shape = [cfg.vector_widths, cfg.slots, cfg.feat_set]
+            shape.extend(cfg.hidden_layers)
+            shape.append(moves.n_moves)
+            if cfg.get('ensemble_size') >= 2:
+                model = ParserNeuralNetEnsemble(shape, update_step=cfg.update_step,
+                                                eta=cfg.eta, rho=cfg.rho,
+                                                n=cfg.ensemble_size)
+            else:
+                model = ParserNeuralNet(shape, update_step=cfg.update_step,
+                                        eta=cfg.eta, rho=cfg.rho)
+        else:
+            model = ParserPerceptron(get_templates(cfg.feat_set))
+
         if path.exists(path.join(model_dir, 'model')):
             model.load(path.join(model_dir, 'model'))
-        return cls(strings, moves, model, project)
+        return cls(strings, moves, model)

     @classmethod
     def load(cls, pkg_or_str_or_file, vocab):
@@ -253,18 +431,18 @@ cdef class Parser:
                          widths=self.model.widths,
                          nr_atom=CONTEXT_SIZE,
                          nr_feat=self.model.nr_feat)
-        cdef weight_t loss = 0
+        loss = 0
         cdef Transition action
         while not stcls.is_final():
             self.model.set_featuresC(eg.c, stcls.c)
+            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
             self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
-
-            # Sets eg.c.scores, which Example uses to calculate eg.guess
-            self.model.updateC(eg.c)
-
-            action = self.moves.c[eg.guess]
+            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
+            assert guess >= 0
+            action = self.moves.c[guess]
             action.do(stcls.c, action.label)
-            loss += eg.loss
+
+            loss += self.model.update(eg)
             eg.reset()
         return loss
diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index 51e465188..9b2d10f89 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -7,7 +7,7 @@ from .vocab cimport Vocab


 cdef class TaggerModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *
+    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil


 cdef class Tagger:
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 991e008ad..e1c3d9b07 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -71,13 +71,13 @@ cpdef enum:


 cdef class TaggerModel(AveragedPerceptron):
-    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
-
-        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
-        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
-        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
-        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
-        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
+    cdef void set_featuresC(self, ExampleC* eg, const void* _token) nogil:
+        token = <const TokenC*>_token
+        _fill_from_token(&eg.atoms[P2_orth], token - 2)
+        _fill_from_token(&eg.atoms[P1_orth], token - 1)
+        _fill_from_token(&eg.atoms[W_orth], token)
+        _fill_from_token(&eg.atoms[N1_orth], token + 1)
+        _fill_from_token(&eg.atoms[N2_orth], token + 2)

         eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)

@@ -153,7 +153,7 @@ cdef class Tagger:
     @classmethod
     def from_package(cls, pkg, vocab):
         # TODO: templates.json deprecated? not present in latest package
-        # templates = cls.default_templates()
+        #templates = cls.default_templates()
         templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())

         model = TaggerModel(templates)
@@ -202,12 +202,13 @@ cdef class Tagger:
                      nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                self.model.set_featuresC(eg.c, tokens.c, i)
+                self.model.set_featuresC(eg.c, &tokens.c[i])
                 self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
                 guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
                 self.vocab.morphology.assign_tag(&tokens.c[i], guess)
                 eg.fill_scores(0, eg.c.nr_class)
+                eg.reset()
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

@@ -231,18 +232,15 @@ cdef class Tagger:
                     nr_class=self.vocab.morphology.n_tags,
                     nr_feat=self.model.nr_feat)
         for i in range(tokens.length):
-            self.model.set_featuresC(eg.c, tokens.c, i)
+            self.model.set_featuresC(eg.c, &tokens.c[i])
             eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]
             self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat, 1)
-            self.model.updateC(eg.c)
-            self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
-
+            self.model.update(eg)
             correct += eg.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
-            eg.fill_scores(0, eg.c.nr_class)
-            eg.fill_costs(0, eg.c.nr_class)
+            eg.reset()
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
         return correct