'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes .conllu format for development data, allowing the official scorer to be used. ''' from __future__ import unicode_literals import plac import tqdm import re import spacy import spacy.util from spacy.gold import GoldParse, minibatch from spacy.syntax.nonproj import projectivize from collections import Counter from timeit import default_timer as timer from spacy._align import align def prevent_bad_sentences(doc): '''This is an example pipeline component for fixing sentence segmentation mistakes. The component sets is_sent_start to False, which means the parser will be prevented from making a sentence boundary there. The rules here aren't necessarily a good idea.''' for token in doc[1:]: if token.nbor(-1).text == ',': token.is_sent_start = False elif not token.nbor(-1).whitespace_: token.is_sent_start = False elif not token.nbor(-1).is_punct: token.is_sent_start = False return doc def load_model(lang): '''This shows how to adjust the tokenization rules, to special-case for ways the CoNLLU tokenization differs. We need to get the tokenizer accuracy high on the various treebanks in order to do well. If we don't align on a content word, all dependencies to and from that word will be marked as incorrect. ''' English = spacy.util.get_lang_class(lang) English.Defaults.infixes += ('(?<=[^-\d])[+\-\*^](?=[^-\d])',) English.Defaults.infixes += ('(?<=[^-])[+\-\*^](?=[^-\d])',) English.Defaults.infixes += ('(?<=[^-\d])[+\-\*^](?=[^-])',) English.Defaults.token_match = re.compile(r'=+').match nlp = English() nlp.tokenizer.add_special_case('***', [{'ORTH': '***'}]) nlp.tokenizer.add_special_case("):", [{'ORTH': ")"}, {"ORTH": ":"}]) nlp.tokenizer.add_special_case("and/or", [{'ORTH': "and"}, {"ORTH": "/"}, {"ORTH": "or"}]) nlp.tokenizer.add_special_case("non-Microsoft", [{'ORTH': "non-Microsoft"}]) nlp.tokenizer.add_special_case("mis-matches", [{'ORTH': "mis-matches"}]) nlp.tokenizer.add_special_case("X.", [{'ORTH': "X"}, {"ORTH": "."}]) nlp.tokenizer.add_special_case("b/c", [{'ORTH': "b/c"}]) return nlp def get_token_acc(docs, golds): '''Quick function to evaluate tokenization accuracy.''' miss = 0 hit = 0 for doc, gold in zip(docs, golds): for i in range(len(doc)): token = doc[i] align = gold.words[i] if align == None: miss += 1 else: hit += 1 return miss, hit def golds_to_gold_tuples(docs, golds): '''Get out the annoying 'tuples' format used by begin_training, given the GoldParse objects.''' tuples = [] for doc, gold in zip(docs, golds): text = doc.text ids, words, tags, heads, labels, iob = zip(*gold.orig_annot) sents = [((ids, words, tags, heads, labels, iob), [])] tuples.append((text, sents)) return tuples def split_text(text): paragraphs = text.split('\n\n') paragraphs = [par.strip().replace('\n', ' ') for par in paragraphs] return paragraphs def read_conllu(file_): docs = [] doc = [] sent = [] for line in file_: if line.startswith('# newdoc'): if doc: docs.append(doc) doc = [] elif line.startswith('#'): continue elif not line.strip(): if sent: doc.append(sent) sent = [] else: sent.append(line.strip().split()) if sent: doc.append(sent) if doc: docs.append(doc) return docs def get_docs(nlp, text): paragraphs = split_text(text) docs = [nlp.make_doc(par) for par in paragraphs] return docs def get_golds(docs, conllu): # sd is spacy doc; cd is conllu doc # cs is conllu sent, ct is conllu token golds = [] for sd, cd in zip(docs, conllu): words = [] tags = [] heads = [] deps = [] for cs in cd: for id_, word, lemma, pos, tag, morph, head, dep, _1, _2 in cs: if '.' in id_: continue i = len(words) id_ = int(id_)-1 head = int(head)-1 if head != '0' else id_ head_dist = head - id_ words.append(word) tags.append(tag) heads.append(i+head_dist) deps.append('ROOT' if dep == 'root' else dep) heads, deps = projectivize(heads, deps) entities = ['-'] * len(words) gold = GoldParse(sd, words=words, tags=tags, heads=heads, deps=deps, entities=entities) golds.append(gold) return golds def parse_dev_data(nlp, text_loc, conllu_loc): with open(text_loc) as file_: docs = get_docs(nlp, file_.read()) with open(conllu_loc) as file_: conllu_dev = read_conllu(file_) golds = list(get_golds(docs, conllu_dev)) scorer = nlp.evaluate(zip(docs, golds)) return docs, scorer def print_progress(itn, losses, scorer): scores = {} for col in ['dep_loss', 'tag_loss', 'uas', 'tags_acc', 'token_acc', 'ents_p', 'ents_r', 'ents_f', 'cpu_wps', 'gpu_wps']: scores[col] = 0.0 scores['dep_loss'] = losses.get('parser', 0.0) scores['ner_loss'] = losses.get('ner', 0.0) scores['tag_loss'] = losses.get('tagger', 0.0) scores.update(scorer.scores) tpl = '\t'.join(( '{:d}', '{dep_loss:.3f}', '{ner_loss:.3f}', '{uas:.3f}', '{ents_p:.3f}', '{ents_r:.3f}', '{ents_f:.3f}', '{tags_acc:.3f}', '{token_acc:.3f}', )) print(tpl.format(itn, **scores)) def print_conllu(docs, file_): for i, doc in enumerate(docs): file_.write("# newdoc id = {i}\n".format(i=i)) for j, sent in enumerate(doc.sents): file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# text = {text}\n".format(text=sent.text)) for k, t in enumerate(sent): if t.head.i == t.i: head = 0 else: head = k + (t.head.i - t.i) + 1 fields = [str(k+1), t.text, t.lemma_, t.pos_, t.tag_, '_', str(head), t.dep_, '_', '_'] file_.write('\t'.join(fields) + '\n') file_.write('\n') def main(spacy_model, conllu_train_loc, text_train_loc, conllu_dev_loc, text_dev_loc, output_loc): with open(conllu_train_loc) as file_: conllu_train = read_conllu(file_) nlp = load_model(spacy_model) print("Get docs") with open(text_train_loc) as file_: docs = get_docs(nlp, file_.read()) golds = list(get_golds(docs, conllu_train)) print("Create parser") nlp.add_pipe(nlp.create_pipe('parser')) nlp.add_pipe(nlp.create_pipe('tagger')) for gold in golds: for tag in gold.tags: if tag is not None: nlp.tagger.add_label(tag) optimizer = nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds)) n_train_words = sum(len(doc) for doc in docs) print(n_train_words) print("Begin training") for i in range(10): with open(text_train_loc) as file_: docs = get_docs(nlp, file_.read()) docs = docs[:len(golds)] with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in minibatch(list(zip(docs, golds)), size=1): if not batch: continue batch_docs, batch_gold = zip(*batch) nlp.update(batch_docs, batch_gold, sgd=optimizer, drop=0.2, losses=losses) pbar.update(sum(len(doc) for doc in batch_docs)) with nlp.use_params(optimizer.averages): dev_docs, scorer = parse_dev_data(nlp, text_dev_loc, conllu_dev_loc) print_progress(i, losses, scorer) with open(output_loc, 'w') as file_: print_conllu(dev_docs, file_) if __name__ == '__main__': plac.call(main)