spaCy/spacy/_gold/corpus.py
Matthew Honnibal a663d44b1b Add GoldCorpus
2020-06-06 14:28:37 +02:00

278 lines
9.7 KiB
Python

import random
import shutil
import tempfile
import srsly
from pathlib import Path
import itertools
from ..tokens import Doc
from .. import util
from ..errors import Errors
from .gold_utils import read_json_file, read_json_object
from .augment import make_orth_variants, add_noise
from .exmaple import Example
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.
DOCS: https://spacy.io/api/goldcorpus
"""
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
train (str / Path): File or directory of training data.
dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
"""
self.limit = limit
if isinstance(train, str) or isinstance(train, Path):
train = self.read_examples(self.walk_corpus(train))
dev = self.read_examples(self.walk_corpus(dev))
# Write temp directory with one doc per file, so we can shuffle and stream
self.tmp_dir = Path(tempfile.mkdtemp())
self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
def __del__(self):
shutil.rmtree(self.tmp_dir)
@staticmethod
def write_msgpack(directory, examples, limit=0):
if not directory.exists():
directory.mkdir()
n = 0
for i, example in enumerate(examples):
ex_dict = example.to_dict()
text = example.text
srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
n += 1
if limit and n >= limit:
break
@staticmethod
def walk_corpus(path):
path = util.ensure_path(path)
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith((".json", ".jsonl")):
locs.append(path)
return locs
@staticmethod
def read_examples(locs, limit=0):
""" Yield training examples """
i = 0
for loc in locs:
loc = util.ensure_path(loc)
file_name = loc.parts[-1]
if file_name.endswith("json"):
examples = read_json_file(loc)
elif file_name.endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
# TODO: proper format checks with schemas
if isinstance(first_gold_tuple, dict):
if first_gold_tuple.get("paragraphs", None):
examples = read_json_object(gold_tuples)
elif first_gold_tuple.get("doc_annotation", None):
examples = []
for ex_dict in gold_tuples:
doc = ex_dict.get("doc", None)
if doc is None:
doc = ex_dict.get("text", None)
if not (
doc is None
or isinstance(doc, Doc)
or isinstance(doc, str)
):
raise ValueError(Errors.E987.format(type=type(doc)))
examples.append(Example.from_dict(ex_dict, doc=doc))
elif file_name.endswith("msg"):
text, ex_dict = srsly.read_msgpack(loc)
examples = [Example.from_dict(ex_dict, doc=text)]
else:
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=loc, formats=supported))
try:
for example in examples:
yield example
i += 1
if limit and i >= limit:
return
except KeyError as e:
msg = "Missing key {}".format(e)
raise KeyError(Errors.E996.format(file=file_name, msg=msg))
except UnboundLocalError:
msg = "Unexpected document structure"
raise ValueError(Errors.E996.format(file=file_name, msg=msg))
@property
def dev_examples(self):
locs = (self.tmp_dir / "dev").iterdir()
yield from self.read_examples(locs, limit=self.limit)
@property
def train_examples(self):
locs = (self.tmp_dir / "train").iterdir()
yield from self.read_examples(locs, limit=self.limit)
def count_train(self):
"""Returns count of words in train examples"""
n = 0
i = 0
for example in self.train_examples:
n += len(example.token_annotation.words)
if self.limit and i >= self.limit:
break
i += 1
return n
def train_dataset(
self,
nlp,
gold_preproc=False,
max_length=None,
noise_level=0.0,
orth_variant_level=0.0,
ignore_misaligned=False,
):
locs = list((self.tmp_dir / "train").iterdir())
random.shuffle(locs)
train_examples = self.read_examples(locs, limit=self.limit)
gold_examples = self.iter_gold_docs(
nlp,
train_examples,
gold_preproc,
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True,
ignore_misaligned=ignore_misaligned,
)
yield from gold_examples
def train_dataset_without_preprocessing(
self, nlp, gold_preproc=False, ignore_misaligned=False
):
examples = self.iter_gold_docs(
nlp,
self.train_examples,
gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned,
)
yield from examples
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
examples = self.iter_gold_docs(
nlp,
self.dev_examples,
gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned,
)
yield from examples
@classmethod
def iter_gold_docs(
cls,
nlp,
examples,
gold_preproc,
max_length=None,
noise_level=0.0,
orth_variant_level=0.0,
make_projective=False,
ignore_misaligned=False,
):
""" Setting gold_preproc will result in creating a doc per sentence """
for example in examples:
if gold_preproc:
split_examples = example.split_sents()
example_golds = []
for split_example in split_examples:
split_example_docs = cls._make_docs(
nlp,
split_example,
gold_preproc,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
)
split_example_golds = cls._make_golds(
split_example_docs,
vocab=nlp.vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned,
)
example_golds.extend(split_example_golds)
else:
example_docs = cls._make_docs(
nlp,
example,
gold_preproc,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
)
example_golds = cls._make_golds(
example_docs,
vocab=nlp.vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned,
)
for ex in example_golds:
if ex.goldparse is not None:
if (not max_length) or len(ex.doc) < max_length:
yield ex
@classmethod
def _make_docs(
cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0
):
var_example = make_orth_variants(
nlp, example, orth_variant_level=orth_variant_level
)
# gold_preproc is not used ?!
if example.text is not None:
var_text = add_noise(var_example.text, noise_level)
var_doc = nlp.make_doc(var_text)
var_example.doc = var_doc
else:
var_doc = Doc(
nlp.vocab,
words=add_noise(var_example.token_annotation.words, noise_level),
)
var_example.doc = var_doc
return [var_example]
@classmethod
def _make_golds(
cls, examples, vocab=None, make_projective=False, ignore_misaligned=False
):
filtered_examples = []
for example in examples:
gold_parses = example.get_gold_parses(
vocab=vocab,
make_projective=make_projective,
ignore_misaligned=ignore_misaligned,
)
assert len(gold_parses) == 1
doc, gold = gold_parses[0]
if doc:
assert doc == example.doc
example.goldparse = gold
filtered_examples.append(example)
return filtered_examples