spaCy/examples/training/conllu.py
Sofie Van Landeghem e48a09df4e Example class for training data (#4543)
* OrigAnnot class instead of gold.orig_annot list of zipped tuples

* from_orig to replace from_annot_tuples

* rename to RawAnnot

* some unit tests for GoldParse creation and internal format

* removing orig_annot and switching to lists instead of tuple

* rewriting tuples to use RawAnnot (+ debug statements, WIP)

* fix pop() changing the data

* small fixes

* pop-append fixes

* return RawAnnot for existing GoldParse to have uniform interface

* clean up imports

* fix merge_sents

* add unit test for 4402 with new structure (not working yet)

* introduce DocAnnot

* typo fixes

* add unit test for merge_sents

* rename from_orig to from_raw

* fixing unit tests

* fix nn parser

* read_annots to produce text, doc_annot pairs

* _make_golds fix

* rename golds_to_gold_annots

* small fixes

* fix encoding

* have golds_to_gold_annots use DocAnnot

* missed a spot

* merge_sents as function in DocAnnot

* allow specifying only part of the token-level annotations

* refactor with Example class + underlying dicts

* pipeline components to work with Example objects (wip)

* input checking

* fix yielding

* fix calls to update

* small fixes

* fix scorer unit test with new format

* fix kwargs order

* fixes for ud and conllu scripts

* fix reading data for conllu script

* add in proper errors (not fixed numbering yet to avoid merge conflicts)

* fixing few more small bugs

* fix EL script
2019-11-11 17:35:27 +01:00

439 lines
14 KiB
Python

"""Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
"""
from __future__ import unicode_literals
import plac
import attr
from pathlib import Path
import re
import json
import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse, Example
from spacy.syntax.nonproj import projectivize
from collections import defaultdict
from spacy.matcher import Matcher
import itertools
import random
import numpy.random
from bin.ud import conll17_ud_eval
import spacy.lang.zh
import spacy.lang.ja
spacy.lang.zh.Chinese.Defaults.use_jieba = False
spacy.lang.ja.Japanese.Defaults.use_janome = False
random.seed(0)
numpy.random.seed(0)
def minibatch_by_words(examples, size=5000):
random.shuffle(examples)
if isinstance(size, int):
size_ = itertools.repeat(size)
else:
size_ = size
examples = iter(examples)
while True:
batch_size = next(size_)
batch = []
while batch_size >= 0:
try:
example = next(examples)
except StopIteration:
if batch:
yield batch
return
batch_size -= len(example.doc)
batch.append(example)
if batch:
yield batch
else:
break
################
# Data reading #
################
space_re = re.compile("\s+")
def split_text(text):
return [space_re.sub(" ", par.strip()) for par in text.split("\n\n")]
def read_data(
nlp,
conllu_file,
text_file,
raw_text=True,
oracle_segments=False,
max_doc_length=None,
limit=None,
):
"""Read the CONLLU format into Example objects. If raw_text=True,
include Doc objects created using nlp.make_doc and then aligned against
the gold-standard sequences. If oracle_segments=True, include Doc objects
created from the gold-standard segments. At least one must be True."""
if not raw_text and not oracle_segments:
raise ValueError("At least one of raw_text or oracle_segments must be True")
paragraphs = split_text(text_file.read())
conllu = read_conllu(conllu_file)
# sd is spacy doc; cd is conllu doc
# cs is conllu sent, ct is conllu token
docs = []
golds = []
for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):
sent_annots = []
for cs in cd:
sent = defaultdict(list)
for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:
if "." in id_:
continue
if "-" in id_:
continue
id_ = int(id_) - 1
head = int(head) - 1 if head != "0" else id_
sent["words"].append(word)
sent["tags"].append(tag)
sent["heads"].append(head)
sent["deps"].append("ROOT" if dep == "root" else dep)
sent["spaces"].append(space_after == "_")
sent["entities"] = ["-"] * len(sent["words"])
sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent))
sent_annots.append(sent)
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots)
sent_annots = []
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return golds_to_gold_data(docs, golds)
if raw_text and sent_annots:
doc, gold = _make_gold(nlp, None, sent_annots)
docs.append(doc)
golds.append(gold)
if limit and len(docs) >= limit:
return golds_to_gold_data(docs, golds)
return golds_to_gold_data(docs, golds)
def read_conllu(file_):
docs = []
sent = []
doc = []
for line in file_:
if line.startswith("# newdoc"):
if doc:
docs.append(doc)
doc = []
elif line.startswith("#"):
continue
elif not line.strip():
if sent:
doc.append(sent)
sent = []
else:
sent.append(list(line.strip().split("\t")))
if len(sent[-1]) != 10:
print(repr(line))
raise ValueError
if sent:
doc.append(sent)
if doc:
docs.append(doc)
return docs
def _make_gold(nlp, text, sent_annots):
# Flatten the conll annotations, and adjust the head indices
flat = defaultdict(list)
for sent in sent_annots:
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"])
for field in ["words", "tags", "deps", "entities", "spaces"]:
flat[field].extend(sent[field])
# Construct text if necessary
assert len(flat["words"]) == len(flat["spaces"])
if text is None:
text = "".join(
word + " " * space for word, space in zip(flat["words"], flat["spaces"])
)
doc = nlp.make_doc(text)
flat.pop("spaces")
gold = GoldParse(doc, **flat)
return doc, gold
#############################
# Data transforms for spaCy #
#############################
def golds_to_gold_data(docs, golds):
"""Get out the training data format used by begin_training, given the
GoldParse objects."""
data = []
for doc, gold in zip(docs, golds):
example = Example(doc=doc)
example.add_doc_annotation(cats=gold.cats)
token_annotation_dict = gold.orig.to_dict()
example.add_token_annotation(**token_annotation_dict)
example.goldparse = gold
data.append(example)
return data
##############
# Evaluation #
##############
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
with text_loc.open("r", encoding="utf8") as text_file:
texts = split_text(text_file.read())
docs = list(nlp.pipe(texts))
with sys_loc.open("w", encoding="utf8") as out_file:
write_conllu(docs, out_file)
with gold_loc.open("r", encoding="utf8") as gold_file:
gold_ud = conll17_ud_eval.load_conllu(gold_file)
with sys_loc.open("r", encoding="utf8") as sys_file:
sys_ud = conll17_ud_eval.load_conllu(sys_file)
scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)
return scores
def write_conllu(docs, file_):
merger = Matcher(docs[0].vocab)
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches]
offsets = [(span.start_char, span.end_char) for span in spans]
for start_char, end_char in offsets:
doc.merge(start_char, end_char)
file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
file_.write("# text = {text}\n".format(text=sent.text))
for k, token in enumerate(sent):
file_.write(token._.get_conllu_lines(k) + "\n")
file_.write("\n")
def print_progress(itn, losses, ud_scores):
fields = {
"dep_loss": losses.get("parser", 0.0),
"tag_loss": losses.get("tagger", 0.0),
"words": ud_scores["Words"].f1 * 100,
"sents": ud_scores["Sentences"].f1 * 100,
"tags": ud_scores["XPOS"].f1 * 100,
"uas": ud_scores["UAS"].f1 * 100,
"las": ud_scores["LAS"].f1 * 100,
}
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"]
if itn == 0:
print("\t".join(header))
tpl = "\t".join(
(
"{:d}",
"{dep_loss:.1f}",
"{las:.1f}",
"{uas:.1f}",
"{tags:.1f}",
"{sents:.1f}",
"{words:.1f}",
)
)
print(tpl.format(itn, **fields))
# def get_sent_conllu(sent, sent_id):
# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]
def get_token_conllu(token, i):
if token._.begins_fused:
n = 1
while token.nbor(n)._.inside_fused:
n += 1
id_ = "%d-%d" % (i, i + n)
lines = [id_, token.text, "_", "_", "_", "_", "_", "_", "_", "_"]
else:
lines = []
if token.head.i == token.i:
head = 0
else:
head = i + (token.head.i - token.i) + 1
fields = [
str(i + 1),
token.text,
token.lemma_,
token.pos_,
token.tag_,
"_",
str(head),
token.dep_.lower(),
"_",
"_",
]
lines.append("\t".join(fields))
return "\n".join(lines)
Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
Token.set_extension("begins_fused", default=False, force=True)
Token.set_extension("inside_fused", default=False, force=True)
##################
# Initialization #
##################
def load_nlp(corpus, config):
lang = corpus.split("_")[0]
nlp = spacy.blank(lang)
if config.vectors:
nlp.vocab.from_disk(config.vectors / "vocab")
return nlp
def initialize_pipeline(nlp, examples, config):
nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag:
nlp.parser.add_multitask_objective("tag")
if config.multitask_sent:
nlp.parser.add_multitask_objective("sent_start")
nlp.parser.moves.add_action(2, "subtok")
nlp.add_pipe(nlp.create_pipe("tagger"))
for ex in examples:
for tag in ex.gold.tags:
if tag is not None:
nlp.tagger.add_label(tag)
# Replace labels that didn't make the frequency cutoff
actions = set(nlp.parser.labels)
label_set = set([act.split("-")[1] for act in actions if "-" in act])
for ex in examples:
gold = ex.gold
for i, label in enumerate(gold.labels):
if label is not None and label not in label_set:
gold.labels[i] = label.split("||")[0]
return nlp.begin_training(lambda: examples)
########################
# Command line helpers #
########################
@attr.s
class Config(object):
vectors = attr.ib(default=None)
max_doc_length = attr.ib(default=10)
multitask_tag = attr.ib(default=True)
multitask_sent = attr.ib(default=True)
nr_epoch = attr.ib(default=30)
batch_size = attr.ib(default=1000)
dropout = attr.ib(default=0.2)
@classmethod
def load(cls, loc):
with Path(loc).open("r", encoding="utf8") as file_:
cfg = json.load(file_)
return cls(**cfg)
class Dataset(object):
def __init__(self, path, section):
self.path = path
self.section = section
self.conllu = None
self.text = None
for file_path in self.path.iterdir():
name = file_path.parts[-1]
if section in name and name.endswith("conllu"):
self.conllu = file_path
elif section in name and name.endswith("txt"):
self.text = file_path
if self.conllu is None:
msg = "Could not find .txt file in {path} for {section}"
raise IOError(msg.format(section=section, path=path))
if self.text is None:
msg = "Could not find .txt file in {path} for {section}"
self.lang = self.conllu.parts[-1].split("-")[0].split("_")[0]
class TreebankPaths(object):
def __init__(self, ud_path, treebank, **cfg):
self.train = Dataset(ud_path / treebank, "train")
self.dev = Dataset(ud_path / treebank, "dev")
self.lang = self.train.lang
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
corpus=(
"UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
"positional",
None,
str,
),
parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
print("Train and evaluate", corpus, "using lang", paths.lang)
nlp = load_nlp(paths.lang, config)
examples = read_data(
nlp,
paths.train.conllu.open(encoding="utf8"),
paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)
optimizer = initialize_pipeline(nlp, examples, config)
for i in range(config.nr_epoch):
docs = [nlp.make_doc(example.doc.text) for example in examples]
batches = minibatch_by_words(examples, size=config.batch_size)
losses = {}
n_train_words = sum(len(doc) for doc in docs)
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
for batch in batches:
pbar.update(sum(len(ex.doc) for ex in batch))
nlp.update(
examples=batch,
sgd=optimizer,
drop=config.dropout,
losses=losses,
)
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages):
scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
print_progress(i, losses, scores)
if __name__ == "__main__":
plac.call(main)