Merge pull request #901 from raphael0202/train_ud

Split CONLLX file using tabs and not default split separators
2025-11-03 01:17:52 +03:00 · 2017-03-21 23:39:45 +01:00 · 2017-03-21 23:39:45 +01:00 · f4010053a6
commit f4010053a6
parent 81b28ca606 08346dba1a
1 changed file with 3 additions and 8 deletions
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 import plac
 import json
 from os import path
 import shutil
 import os
 import random
 import io
 import pathlib
 from spacy.tokens import Doc
 from spacy.syntax.nonproj import PseudoProjectivity
 from spacy.language import Language
 from spacy.gold import GoldParse
 from spacy.vocab import Vocab
 from spacy.tagger import Tagger
 from spacy.pipeline import DependencyParser, BeamDependencyParser
 from spacy.syntax.parser import get_templates
@@ -23,7 +18,6 @@ import spacy.attrs
 import io
 def read_conllx(loc, n=0):
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
@@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
                lines.pop(0)
            tokens = []
            for line in lines:
-                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
+                id_, word, lemma, pos, tag, morph, head, dep, _1, \
+                _2 = line.split('\t')
                if '-' in id_ or '.' in id_:
                    continue
                try:
@@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
-    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
+    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))