mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Merge pull request #901 from raphael0202/train_ud
Split CoNLL-X lines on tabs rather than on the default whitespace separators
This commit is contained in:
		
						commit
						f4010053a6
					
				| 
						 | 
					@ -1,18 +1,13 @@
 | 
				
			||||||
from __future__ import unicode_literals
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
import plac
 | 
					import plac
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
from os import path
 | 
					 | 
				
			||||||
import shutil
 | 
					 | 
				
			||||||
import os
 | 
					 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
import io
 | 
					 | 
				
			||||||
import pathlib
 | 
					import pathlib
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from spacy.tokens import Doc
 | 
					from spacy.tokens import Doc
 | 
				
			||||||
from spacy.syntax.nonproj import PseudoProjectivity
 | 
					from spacy.syntax.nonproj import PseudoProjectivity
 | 
				
			||||||
from spacy.language import Language
 | 
					from spacy.language import Language
 | 
				
			||||||
from spacy.gold import GoldParse
 | 
					from spacy.gold import GoldParse
 | 
				
			||||||
from spacy.vocab import Vocab
 | 
					 | 
				
			||||||
from spacy.tagger import Tagger
 | 
					from spacy.tagger import Tagger
 | 
				
			||||||
from spacy.pipeline import DependencyParser, BeamDependencyParser
 | 
					from spacy.pipeline import DependencyParser, BeamDependencyParser
 | 
				
			||||||
from spacy.syntax.parser import get_templates
 | 
					from spacy.syntax.parser import get_templates
 | 
				
			||||||
| 
						 | 
					@ -23,7 +18,6 @@ import spacy.attrs
 | 
				
			||||||
import io
 | 
					import io
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
def read_conllx(loc, n=0):
 | 
					def read_conllx(loc, n=0):
 | 
				
			||||||
    with io.open(loc, 'r', encoding='utf8') as file_:
 | 
					    with io.open(loc, 'r', encoding='utf8') as file_:
 | 
				
			||||||
        text = file_.read()
 | 
					        text = file_.read()
 | 
				
			||||||
| 
						 | 
					@ -35,7 +29,8 @@ def read_conllx(loc, n=0):
 | 
				
			||||||
                lines.pop(0)
 | 
					                lines.pop(0)
 | 
				
			||||||
            tokens = []
 | 
					            tokens = []
 | 
				
			||||||
            for line in lines:
 | 
					            for line in lines:
 | 
				
			||||||
                id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split()
 | 
					                id_, word, lemma, pos, tag, morph, head, dep, _1, \
 | 
				
			||||||
 | 
					                _2 = line.split('\t')
 | 
				
			||||||
                if '-' in id_ or '.' in id_:
 | 
					                if '-' in id_ or '.' in id_:
 | 
				
			||||||
                    continue
 | 
					                    continue
 | 
				
			||||||
                try:
 | 
					                try:
 | 
				
			||||||
| 
						 | 
					@ -134,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
 | 
				
			||||||
        random.shuffle(train_sents)
 | 
					        random.shuffle(train_sents)
 | 
				
			||||||
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
 | 
					        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
 | 
				
			||||||
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
 | 
					        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
 | 
				
			||||||
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
 | 
					    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
 | 
				
			||||||
    nlp.end_training(model_dir)
 | 
					    nlp.end_training(model_dir)
 | 
				
			||||||
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
 | 
					    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
 | 
				
			||||||
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
 | 
					    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user