mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			202 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			202 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from __future__ import unicode_literals, print_function
 | |
| import plac
 | |
| import json
 | |
| import random
 | |
| import pathlib
 | |
| 
 | |
| from spacy.tokens import Doc
 | |
| from spacy.syntax.nonproj import PseudoProjectivity
 | |
| from spacy.language import Language
 | |
| from spacy.gold import GoldParse
 | |
| from spacy.tagger import Tagger
 | |
| from spacy.pipeline import DependencyParser, TokenVectorEncoder
 | |
| from spacy.syntax.parser import get_templates
 | |
| from spacy.syntax.arc_eager import ArcEager
 | |
| from spacy.scorer import Scorer
 | |
| from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
 | |
| import spacy.attrs
 | |
| import io
 | |
| from thinc.neural.ops import CupyOps
 | |
| from thinc.neural import Model
 | |
| from spacy.es import Spanish
 | |
| from spacy.attrs import POS
 | |
| 
 | |
| 
 | |
| from thinc.neural import Model
 | |
| 
 | |
| 
 | |
| try:
 | |
|     import cupy
 | |
|     from thinc.neural.ops import CupyOps
 | |
| except:
 | |
|     cupy = None
 | |
| 
 | |
| 
 | |
def read_conllx(loc, n=0):
    """Yield one training example per sentence of a CoNLL-X style file.

    Sentences are separated by blank lines and each token row has ten
    tab-separated columns. Rows whose ID contains '-' or '.' (multiword
    ranges and empty nodes) are skipped.

    loc: Path of the UTF-8 encoded file to read.
    n: If >= 1, stop after yielding this many sentences; otherwise read all.

    Yields tuples ``(None, [[annotations, []]])`` where ``annotations`` is
    ``[ids, words, tags, heads, deps, ner]``, six parallel per-token lists.
    Heads are 0-based, the root token points at itself, and the tag is the
    composite string ``pos + '__' + dep + '__' + morph``.

    Raises ValueError if a token row does not have exactly ten columns or
    has a non-integer ID/head.
    """
    with io.open(loc, 'r', encoding='utf8') as file_:
        text = file_.read()
    n_yielded = 0
    for sent in text.strip().split('\n\n'):
        # Drop blank lines so an empty file (or stray whitespace) does not
        # reach the ten-column unpack below.
        lines = [line for line in sent.strip().split('\n') if line.strip()]
        # Skip leading comment lines; guard against comment-only chunks.
        while lines and lines[0].startswith('#'):
            lines.pop(0)
        tokens = []
        for line in lines:
            id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = \
                line.split('\t')
            if '-' in id_ or '.' in id_:
                continue
            id_ = int(id_) - 1
            head = (int(head) - 1) if head != '0' else id_
            dep = 'ROOT' if dep == 'root' else dep
            tag = pos + '__' + dep + '__' + morph
            # NOTE: registers the composite tag on the Spanish defaults as a
            # side effect, regardless of which language is being trained.
            Spanish.Defaults.tag_map[tag] = {POS: pos}
            tokens.append((id_, word, tag, head, dep, 'O'))
        if not tokens:
            # Nothing usable in this chunk; an empty annotation could not be
            # unpacked into six columns by consumers.
            continue
        tuples = [list(t) for t in zip(*tokens)]
        yield (None, [[tuples, []]])
        n_yielded += 1
        if n >= 1 and n_yielded >= n:
            break
 | |
| 
 | |
| 
 | |
def score_model(vocab, encoder, parser, Xs, ys, verbose=False):
    """Re-parse each document from its raw words and score it against gold.

    vocab: Vocabulary used to build the fresh Doc objects.
    encoder: Callable applied to each fresh Doc before parsing.
    parser: Callable applied to each Doc after the encoder.
    Xs: Iterable of documents; only the token texts are read from them.
    ys: Iterable of gold parses, aligned with Xs.
    verbose: Passed through to Scorer.score.

    Returns the Scorer holding the accumulated results.
    """
    scorer = Scorer()
    for doc, gold in zip(Xs, ys):
        # Rebuild the Doc from its words so no earlier annotations carry over.
        doc = Doc(vocab, words=[w.text for w in doc])
        encoder(doc)
        parser(doc)
        PseudoProjectivity.deprojectivize(doc)
        scorer.score(doc, gold, verbose=verbose)
    return scorer
 | |
| 
 | |
| 
 | |
def organize_data(vocab, train_sents):
    """Turn annotation tuples into parallel lists of Docs and GoldParses.

    vocab: Vocabulary used to construct each Doc.
    train_sents: Iterable of ``(_, doc_sents)`` pairs, where each entry of
        ``doc_sents`` is ``((ids, words, tags, heads, deps, ner), _)``.

    Returns ``(Xs, ys)``: the Docs built from the words, and the matching
    GoldParse objects built from tags, heads and deps.
    """
    pairs = []
    for _, doc_sents in train_sents:
        for annotations, _ in doc_sents:
            ids, words, tags, heads, deps, ner = annotations
            doc = Doc(vocab, words=words)
            pairs.append(
                (doc, GoldParse(doc, tags=tags, heads=heads, deps=deps)))
    Xs = [doc for doc, _ in pairs]
    ys = [gold for _, gold in pairs]
    return Xs, ys
 | |
| 
 | |
| 
 | |
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    """Train a token-vector encoder and dependency parser from CoNLL-X data.

    lang_name: Language code passed to spacy.util.get_lang_class.
    train_loc: Path of the training file (read with read_conllx).
    dev_loc: Path of the development file (read with read_conllx).
    model_dir: Output directory; 'deps', 'pos' and 'vocab' sub-directories
        are created if missing and 'deps/config.json' is written.
    clusters_loc: Optional path of a file with whitespace-separated
        ``cluster word freq`` rows; lines that do not split into exactly
        three fields are ignored.

    Prints one line of dev-set scores before training, one per epoch, and a
    final line after training. The trained model is not saved to disk.
    """
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    dev_sents = list(read_conllx(dev_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        # Reuse strings/lexemes left by an earlier run, when present.
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                # The cluster is a bit string; it is reversed before being
                # parsed as a base-2 integer.
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab: looking an entry up is what adds it.
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    # Composite tags are 'pos__dep__morph'; map each one to
                    # its coarse POS prefix.
                    vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
    # NOTE(review): `tagger` is never used below; kept in case constructing
    # it has side effects on the vocab -- confirm before removing.
    tagger = Tagger(vocab)
    encoder = TokenVectorEncoder(vocab, width=64)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    Xs, ys = organize_data(vocab, train_sents)
    dev_Xs, dev_ys = organize_data(vocab, dev_sents)
    with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
        docs = list(Xs)
        for doc in docs:
            encoder(doc)
        # One slot per evaluation; the list length doubles as the iteration
        # counter. Nothing in this function accumulates a loss into it.
        nn_loss = [0.]
        def track_progress():
            # Evaluate on the dev set using the optimizer's averaged params.
            with encoder.tagger.use_params(optimizer.averages):
                with parser.model.use_params(optimizer.averages):
                    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
            itn = len(nn_loss)
            print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
            nn_loss.append(0.)
        track_progress()
        trainer.each_epoch.append(track_progress)
        trainer.batch_size = 24
        trainer.nb_epoch = 40
        for docs, golds in trainer.iterate(Xs, ys, progress_bar=True):
            # Train on fresh Docs built from the words only.
            docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
            tokvecs, upd_tokvecs = encoder.begin_update(docs)
            for doc, tokvec in zip(docs, tokvecs):
                doc.tensor = tokvec
            d_tokvecs = parser.update(docs, golds, sgd=optimizer)
            upd_tokvecs(d_tokvecs, sgd=optimizer)
            encoder.update(docs, golds, sgd=optimizer)
    # NOTE(review): `nlp` is not used afterwards; kept as in the original.
    nlp = LangClass(vocab=vocab, parser=parser)
    # Final evaluation on the prepared dev Docs/GoldParses. score_model needs
    # both Xs and ys, and the iteration number is derived the same way
    # track_progress derives it.
    scorer = score_model(vocab, encoder, parser, dev_Xs, dev_ys)
    itn = len(nn_loss)
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
    # TODO: the trained model is not written to model_dir; saving was
    # disabled here (previously: nlp.end_training(model_dir)).
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    import cProfile
    import pstats
    # Set to True to run training under cProfile and print the hot spots.
    PROFILE = False
    if PROFILE:
        cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
        # Only read the stats file in the branch that actually writes it.
        s = pstats.Stats("Profile.prof")
        s.strip_dirs().sort_stats("time").print_stats()
    else:
        # Run the command-line entry point exactly once.
        plac.call(main)
 |