mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
* Use updated JSON format, with sentences below paragraphs. Allows use of gold preprocessing flag.
This commit is contained in:
parent
2d11739f28
commit
76300bbb1b
|
@ -81,21 +81,21 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
|
|||
for itn in range(n_iter):
|
||||
scorer = Scorer()
|
||||
loss = 0
|
||||
for raw_text, annot_tuples, ctnt in gold_tuples:
|
||||
score_model(scorer, nlp, raw_text, annot_tuples)
|
||||
if raw_text is None:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
else:
|
||||
tokens = nlp.tokenizer(raw_text)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
nlp.tagger(tokens)
|
||||
try:
|
||||
loss += nlp.parser.train(tokens, gold)
|
||||
except AssertionError:
|
||||
# TODO: Do something about non-projective sentences
|
||||
pass
|
||||
nlp.entity.train(tokens, gold)
|
||||
nlp.tagger.train(tokens, gold.tags)
|
||||
for raw_text, sents in gold_tuples:
|
||||
if not gold_preproc:
|
||||
sents = _merge_sents(sents)
|
||||
for annot_tuples, ctnt in sents:
|
||||
score_model(scorer, nlp, raw_text, annot_tuples)
|
||||
if raw_text is None or gold_preproc:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
else:
|
||||
tokens = nlp.tokenizer(raw_text)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
nlp.tagger(tokens)
|
||||
if gold.is_projective:
|
||||
loss += nlp.parser.train(tokens, gold)
|
||||
nlp.entity.train(tokens, gold)
|
||||
nlp.tagger.train(tokens, gold.tags)
|
||||
random.shuffle(gold_tuples)
|
||||
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||
scorer.tags_acc,
|
||||
|
@ -107,19 +107,21 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0
|
|||
|
||||
|
||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=True):
|
||||
assert not gold_preproc
|
||||
nlp = Language(data_dir=model_dir)
|
||||
scorer = Scorer()
|
||||
for raw_text, annot_tuples, brackets in gold_tuples:
|
||||
if raw_text is not None:
|
||||
tokens = nlp(raw_text, merge_mwes=False)
|
||||
else:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.entity(tokens)
|
||||
nlp.parser(tokens)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
for raw_text, sents in gold_tuples:
|
||||
for annot_tuples, brackets in sents:
|
||||
if raw_text is None or gold_preproc:
|
||||
tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
|
||||
nlp.tagger(tokens)
|
||||
nlp.entity(tokens)
|
||||
nlp.parser(tokens)
|
||||
else:
|
||||
tokens = nlp(raw_text, merge_mwes=False)
|
||||
gold = GoldParse(tokens, annot_tuples)
|
||||
scorer.score(tokens, gold, verbose=verbose)
|
||||
for t in tokens:
|
||||
print t.orth_, t.dep_, t.head.orth_, t.ent_type_
|
||||
return scorer
|
||||
|
||||
|
||||
|
@ -141,6 +143,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
|
|||
train_loc=("Location of training file or directory"),
|
||||
dev_loc=("Location of development file or directory"),
|
||||
corruption_level=("Amount of noise to add to training data", "option", "c", float),
|
||||
gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool),
|
||||
model_dir=("Location of output model directory",),
|
||||
out_loc=("Out location", "option", "o", str),
|
||||
n_sents=("Number of training sentences", "option", "n", int),
|
||||
|
@ -149,16 +152,16 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
|
|||
debug=("Debug mode", "flag", "d", bool)
|
||||
)
|
||||
def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
|
||||
debug=False, corruption_level=0.0):
|
||||
debug=False, corruption_level=0.0, gold_preproc=False):
|
||||
gold_train = list(read_json_file(train_loc))
|
||||
train(English, gold_train, model_dir,
|
||||
feat_set='basic' if not debug else 'debug',
|
||||
gold_preproc=False, n_sents=n_sents,
|
||||
gold_preproc=gold_preproc, n_sents=n_sents,
|
||||
corruption_level=corruption_level, n_iter=n_iter)
|
||||
if out_loc:
|
||||
write_parses(English, dev_loc, model_dir, out_loc)
|
||||
#if out_loc:
|
||||
# write_parses(English, dev_loc, model_dir, out_loc)
|
||||
scorer = evaluate(English, list(read_json_file(dev_loc)),
|
||||
model_dir, gold_preproc=False, verbose=verbose)
|
||||
model_dir, gold_preproc=gold_preproc, verbose=verbose)
|
||||
print 'TOK', 100-scorer.token_acc
|
||||
print 'POS', scorer.tags_acc
|
||||
print 'UAS', scorer.uas
|
||||
|
|
|
@ -104,24 +104,25 @@ def read_json_file(loc):
|
|||
for doc in ijson.items(file_, 'item'):
|
||||
paragraphs = []
|
||||
for paragraph in doc['paragraphs']:
|
||||
words = []
|
||||
ids = []
|
||||
tags = []
|
||||
heads = []
|
||||
labels = []
|
||||
ner = []
|
||||
for token in paragraph['tokens']:
|
||||
words.append(token['orth'])
|
||||
ids.append(token['id'])
|
||||
tags.append(token['tag'])
|
||||
heads.append(token['head'] if token['head'] >= 0 else token['id'])
|
||||
labels.append(token['dep'])
|
||||
ner.append(token.get('ner', '-'))
|
||||
|
||||
yield (
|
||||
paragraph.get('raw', None),
|
||||
(ids, words, tags, heads, labels, ner),
|
||||
paragraph.get('brackets', []))
|
||||
sents = []
|
||||
for sent in paragraph['sentences']:
|
||||
words = []
|
||||
ids = []
|
||||
tags = []
|
||||
heads = []
|
||||
labels = []
|
||||
ner = []
|
||||
for i, token in enumerate(sent['tokens']):
|
||||
words.append(token['orth'])
|
||||
ids.append(i)
|
||||
tags.append(token['tag'])
|
||||
heads.append(token['head'] + i)
|
||||
labels.append(token['dep'])
|
||||
ner.append(token.get('ner', '-'))
|
||||
sents.append((
|
||||
(ids, words, tags, heads, labels, ner),
|
||||
sent.get('brackets', [])))
|
||||
yield (paragraph.get('raw', None), sents)
|
||||
|
||||
|
||||
def _iob_to_biluo(tags):
|
||||
|
@ -203,6 +204,19 @@ cdef class GoldParse:
|
|||
def __len__(self):
|
||||
return self.length
|
||||
|
||||
@property
def is_projective(self):
    """Whether the gold dependency tree is projective (has no crossing arcs).

    Builds one (left, right) span per token from `self.orig_annot`
    (6-tuples of id, word, tag, head, dep, ner) and checks every pair of
    spans for interleaving.  Two arcs cross when one strictly interleaves
    the other (w1 < w2 < h1 < h2) or when one arc's head falls strictly
    inside the other's span (w1 < w2 == h2 < h1).

    Returns:
        bool: True if no pair of arcs crosses, False otherwise.
    """
    heads = [head for (id_, word, tag, head, dep, ner) in self.orig_annot]
    # Normalise each (child, head) arc to a sorted (left, right) span.
    deps = sorted([sorted(arc) for arc in enumerate(heads)])
    for w1, h1 in deps:
        for w2, h2 in deps:
            if w1 < w2 < h1 < h2:
                return False
            elif w1 < w2 == h2 < h1:
                return False
    # Projective only if *every* pair was checked without finding a
    # crossing.  (The previous version returned True from inside the
    # inner loop after inspecting just the first non-crossing pair,
    # so most non-projective trees were misreported as projective.)
    return True
|
||||
|
||||
|
||||
def is_punct_label(label):
    """Check whether a dependency label marks punctuation.

    A label counts as punctuation if it is the bare tag 'P' or any
    casing of 'punct'.
    """
    if label == 'P':
        return True
    return label.lower() == 'punct'
|
||||
|
|
|
@ -54,15 +54,16 @@ cdef class ArcEager(TransitionSystem):
|
|||
move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
|
||||
LEFT: {'ROOT': True}, BREAK: {'ROOT': True},
|
||||
CONSTITUENT: {}, ADJUST: {'': True}}
|
||||
for raw_text, (ids, words, tags, heads, labels, iob), ctnts in gold_parses:
|
||||
for child, head, label in zip(ids, heads, labels):
|
||||
if label != 'ROOT':
|
||||
if head < child:
|
||||
move_labels[RIGHT][label] = True
|
||||
elif head > child:
|
||||
move_labels[LEFT][label] = True
|
||||
for start, end, label in ctnts:
|
||||
move_labels[CONSTITUENT][label] = True
|
||||
for raw_text, sents in gold_parses:
|
||||
for (ids, words, tags, heads, labels, iob), ctnts in sents:
|
||||
for child, head, label in zip(ids, heads, labels):
|
||||
if label != 'ROOT':
|
||||
if head < child:
|
||||
move_labels[RIGHT][label] = True
|
||||
elif head > child:
|
||||
move_labels[LEFT][label] = True
|
||||
for start, end, label in ctnts:
|
||||
move_labels[CONSTITUENT][label] = True
|
||||
return move_labels
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
|
|
|
@ -73,15 +73,15 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
|
||||
OUT: {'': True}}
|
||||
moves = ('M', 'B', 'I', 'L', 'U')
|
||||
for (raw_text, tuples, ctnt) in gold_tuples:
|
||||
ids, words, tags, heads, labels, biluo = tuples
|
||||
for i, ner_tag in enumerate(biluo):
|
||||
if ner_tag != 'O' and ner_tag != '-':
|
||||
if ner_tag.count('-') != 1:
|
||||
raise ValueError(ner_tag)
|
||||
_, label = ner_tag.split('-')
|
||||
for move_str in ('B', 'I', 'L', 'U'):
|
||||
move_labels[moves.index(move_str)][label] = True
|
||||
for raw_text, sents in gold_tuples:
|
||||
for (ids, words, tags, heads, labels, biluo), _ in sents:
|
||||
for i, ner_tag in enumerate(biluo):
|
||||
if ner_tag != 'O' and ner_tag != '-':
|
||||
if ner_tag.count('-') != 1:
|
||||
raise ValueError(ner_tag)
|
||||
_, label = ner_tag.split('-')
|
||||
for move_str in ('B', 'I', 'L', 'U'):
|
||||
move_labels[moves.index(move_str)][label] = True
|
||||
return move_labels
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
|
|
Loading…
Reference in New Issue
Block a user