spaCy/spacy/gold.pyx

# cython: profile=True
# coding: utf8
from __future__ import unicode_literals, print_function

import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly

from .syntax import nonproj
from .tokens import Doc, Span
from .errors import Errors, AlignmentError
from .compat import path2str, basestring_
from . import util


# Feature flag read by align(): while it is False, align() delegates to the
# legacy _align_before_v2_2_2() implementation instead of running its own
# character-offset based alignment.
USE_NEW_ALIGN = False
# Matches a single non-word character (anything that is not a letter, digit
# or underscore).
punct_re = re.compile(r"\W")


def tags_to_entities(tags):
    """Collect entity spans from a sequence of BILUO-style tags.

    tags (list): One tag per token, e.g. "B-ORG", "I-ORG", "L-ORG", "U-PER",
        "O", "-" or None. The sequence is sliced when building an error
        message.
    RETURNS (list): `(label, start, end)` tuples with inclusive token
        indices. The label is everything after the first two characters of
        the tag.
    RAISES (ValueError): If an "I" tag appears while no entity is open, or
        if a tag starts with none of the known prefixes.
    """
    spans = []
    open_at = None
    for idx, tag in enumerate(tags):
        # None and "-" both mean "no annotation here" and leave any open
        # entity untouched.
        if tag is None or tag == "-":
            continue
        if tag.startswith("O"):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            # An "O" silently discards an entity that was never closed.
            open_at = None
        elif tag.startswith("I"):
            if open_at is None:
                raise ValueError(Errors.E067.format(tags=tags[:idx + 1]))
        elif tag.startswith("U"):
            spans.append((tag[2:], idx, idx))
        elif tag.startswith("B"):
            open_at = idx
        elif tag.startswith("L"):
            spans.append((tag[2:], open_at, idx))
            open_at = None
        else:
            raise ValueError(Errors.E068.format(tag=tag))
    return spans


_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]


def _normalize_for_alignment(tokens):
    tokens = [w.replace(" ", "").lower() for w in tokens]
    output = []
    for token in tokens:
        token = token.replace(" ", "").lower()
        for before, after in _ALIGNMENT_NORM_MAP:
            token = token.replace(before, after)
        output.append(token)
    return output


def _align_before_v2_2_2(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations, using the Levenshtein
    algorithm. The alignment is case-insensitive.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens.
      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
        it has the value -1.
      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
            direction.
    """
    # Imported inside the function rather than at module level.
    from . import _align
    # Fast path: identical tokenizations get cost 0 and the same identity
    # mapping object for both directions.
    if tokens_a == tokens_b:
        alignment = numpy.arange(len(tokens_a))
        return 0, alignment, alignment, {}, {}
    # Compare without spaces inside tokens and without case distinctions.
    tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
    tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
    # NOTE(review): the exact contents of `matrix` and the semantics of
    # _align.align / _align.multi_align live in the _align module, which is
    # not visible here. `matrix` is not used below.
    cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
    i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
                                                        [len(w) for w in tokens_b])
    # An entry in a "multi" table whose neighbouring indices (i-1 and i+1) do
    # not currently map to the same target is moved into the one-to-one table.
    # We iterate over a snapshot because entries are popped as we go; earlier
    # pops affect the neighbour lookups of later entries.
    for i, j in list(i2j_multi.items()):
        if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
            i2j[i] = j
            i2j_multi.pop(i)
    # Same clean-up for the opposite direction.
    for j, i in list(j2i_multi.items()):
        if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
            j2i[j] = i
            j2i_multi.pop(j)
    return cost, i2j, j2i, i2j_multi, j2i_multi


def align(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations.

    While the module-level flag `USE_NEW_ALIGN` is False this simply
    delegates to `_align_before_v2_2_2`. Otherwise both tokenizations are
    normalized and walked in parallel, character offset by character offset.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens.
      * a2b (numpy.ndarray): Mapping of indices in `tokens_a` to indices in
        `tokens_b`. For instance, if `a2b[4] == 6`, that means that
        `tokens_a[4]` aligns to `tokens_b[6]`. If there's no one-to-one
        alignment for a token, it has the value -1.
      * b2a (numpy.ndarray): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
            direction.
    RAISES (AlignmentError): If the two tokenizations diverge, i.e. at some
        position neither remaining token text is a prefix of the other.
    """
    if not USE_NEW_ALIGN:
        return _align_before_v2_2_2(tokens_a, tokens_b)
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    cost = 0
    # Pre-fill with -1 ("no one-to-one alignment"). The loop below stops as
    # soon as either side is exhausted, so slots it never reaches would
    # otherwise hold uninitialized memory.
    a2b = numpy.full(len(tokens_a), -1, dtype="i")
    b2a = numpy.full(len(tokens_b), -1, dtype="i")
    a2b_multi = {}
    b2a_multi = {}
    i = 0
    j = 0
    # How many characters of the current token on each side have already
    # been matched against the other side.
    offset_a = 0
    offset_b = 0
    while i < len(tokens_a) and j < len(tokens_b):
        a = tokens_a[i][offset_a:]
        b = tokens_b[j][offset_b:]
        if a == b:
            if offset_a == offset_b == 0:
                # Whole tokens are identical: a true one-to-one alignment.
                a2b[i] = j
                b2a[j] = i
            elif offset_a == 0:
                # tokens_a[i] is the tail of a longer tokens_b[j].
                cost += 2
                a2b_multi[i] = j
            elif offset_b == 0:
                # tokens_b[j] is the tail of a longer tokens_a[i].
                cost += 2
                b2a_multi[j] = i
            offset_a = offset_b = 0
            i += 1
            j += 1
        elif a == "":
            # Token that normalized to the empty string: skip it.
            assert offset_a == 0
            cost += 1
            i += 1
        elif b == "":
            assert offset_b == 0
            cost += 1
            j += 1
        elif b.startswith(a):
            # tokens_a[i] covers the next part of tokens_b[j].
            cost += 1
            if offset_a == 0:
                a2b_multi[i] = j
            i += 1
            offset_a = 0
            offset_b += len(a)
        elif a.startswith(b):
            # tokens_b[j] covers the next part of tokens_a[i].
            cost += 1
            if offset_b == 0:
                b2a_multi[j] = i
            j += 1
            offset_b = 0
            offset_a += len(b)
        else:
            assert "".join(tokens_a) != "".join(tokens_b)
            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
    return cost, a2b, b2a, a2b_multi, b2a_multi


class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER.

    DOCS: https://spacy.io/api/goldcorpus
    """
    def __init__(self, train, dev, gold_preproc=False, limit=None):
        """Create a GoldCorpus.

        train (unicode or Path): File or directory of training data. Any
            other value is used as-is as an iterable of examples.
        dev (unicode or Path): File or directory of development data. It is
            only read from disk when `train` is a string or Path.
        gold_preproc (bool): Not used by the constructor.
        limit (int): Optional cap that is passed on whenever examples are
            written or read.
        RETURNS (GoldCorpus): The newly created object.
        """
        self.limit = limit
        if isinstance(train, (str, Path)):
            train = self.read_examples(self.walk_corpus(train))
            dev = self.read_examples(self.walk_corpus(dev))
        # Write temp directory with one doc per file, so we can shuffle and stream
        self.tmp_dir = Path(tempfile.mkdtemp())
        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)

    def __del__(self):
        """Remove the temporary directory created by the constructor."""
        # The attribute is missing if construction failed before the
        # directory was created; there is nothing to clean up in that case.
        tmp_dir = getattr(self, "tmp_dir", None)
        if tmp_dir is not None:
            shutil.rmtree(path2str(tmp_dir))

    @staticmethod
    def write_msgpack(directory, examples, limit=0):
        """Write each example to its own `<index>.msg` file in `directory`.

        directory (Path): Target directory, created if it does not exist.
        examples (iterable): The examples to write.
        limit (int): Stop once at least this many token annotations have
            been written. A falsy value disables the cap.
        """
        if not directory.exists():
            directory.mkdir()
        n = 0
        for i, example in enumerate(examples):
            ex_dict = example.to_dict()
            text = example.text
            srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
            n += len(example.token_annotations)
            if limit and n >= limit:
                break

    @staticmethod
    def walk_corpus(path):
        """Collect the .json / .jsonl files below `path`.

        path (unicode or Path): A file or a directory.
        RETURNS (list): `[path]` if it is not a directory, otherwise every
            .json/.jsonl file found below it. Entries whose name starts
            with "." are skipped.
        """
        path = util.ensure_path(path)
        if not path.is_dir():
            return [path]
        # `pending` doubles as the work queue: directories append their
        # children to it while it is being iterated.
        pending = [path]
        locs = []
        seen = set()
        for entry in pending:
            if str(entry) in seen:
                continue
            seen.add(str(entry))
            if entry.parts[-1].startswith("."):
                continue
            elif entry.is_dir():
                pending.extend(entry.iterdir())
            elif entry.parts[-1].endswith((".json", ".jsonl")):
                locs.append(entry)
        return locs

    @staticmethod
    def read_examples(locs, limit=0):
        """Yield training examples from .json, .jsonl and .msg files.

        locs (iterable): Paths of the files to read.
        limit (int): Stop once at least this many token annotations have
            been yielded. A falsy value disables the cap.
        YIELDS (Example): The examples, file by file.
        RAISES (ValueError): If a file name has none of the supported endings.
        """
        i = 0
        for loc in locs:
            loc = util.ensure_path(loc)
            file_name = loc.parts[-1]
            # Start from a clean slate for every file, so that a .jsonl file
            # in an unrecognized format contributes nothing instead of
            # failing on an unbound name or re-yielding the previous file.
            examples = []
            if file_name.endswith("json"):
                examples = read_json_file(loc)
            elif file_name.endswith("jsonl"):
                gold_tuples = srsly.read_jsonl(loc)
                # Peek at the first record to detect the format. The default
                # keeps an empty file from raising StopIteration inside this
                # generator.
                first_gold_tuple = next(gold_tuples, None)
                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
                # TODO: proper format checks with schemas
                if isinstance(first_gold_tuple, dict):
                    if first_gold_tuple.get("paragraphs", None):
                        examples = read_json_object(gold_tuples)
                    elif first_gold_tuple.get("doc_annotation", None):
                        examples = []
                        for ex_dict in gold_tuples:
                            doc = ex_dict.get("doc", None)
                            if doc is None:
                                doc = ex_dict.get("text", None)
                            examples.append(Example.from_dict(ex_dict, doc=doc))
            elif file_name.endswith("msg"):
                text, ex_dict = srsly.read_msgpack(loc)
                examples = [Example.from_dict(ex_dict, doc=text)]
            else:
                supported = ("json", "jsonl", "msg")
                raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
            for example in examples:
                yield example
                i += len(example.token_annotations)
                if limit and i >= limit:
                    return

    @property
    def dev_examples(self):
        """Yield the development examples stored in the temp directory."""
        locs = (self.tmp_dir / "dev").iterdir()
        yield from self.read_examples(locs, limit=self.limit)

    @property
    def train_examples(self):
        """Yield the training examples stored in the temp directory."""
        locs = (self.tmp_dir / "train").iterdir()
        yield from self.read_examples(locs, limit=self.limit)

    def count_train(self):
        """Count the words in the training examples.

        RETURNS (int): The summed number of words over the token annotations
            that were visited.
        """
        # TODO: should this count words or sentences ?
        n = 0
        i = 0
        for example in self.train_examples:
            for token_annotation in example.token_annotations:
                n += len(token_annotation.words)
                # NOTE(review): this only leaves the inner loop; later
                # examples are still visited (and break again after their
                # first annotation). Confirm whether that is intended.
                if self.limit and i >= self.limit:
                    break
                i += 1
        return n

    def train_dataset(self, nlp, gold_preproc=False, max_length=None,
                    noise_level=0.0, orth_variant_level=0.0,
                    ignore_misaligned=False):
        """Yield the training examples in shuffled file order, with docs and
        gold parses attached. See `iter_gold_docs` for the arguments.
        """
        locs = list((self.tmp_dir / 'train').iterdir())
        random.shuffle(locs)
        train_examples = self.read_examples(locs, limit=self.limit)
        gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc,
                                        max_length=max_length,
                                        noise_level=noise_level,
                                        orth_variant_level=orth_variant_level,
                                        make_projective=True,
                                        ignore_misaligned=ignore_misaligned)
        yield from gold_examples

    def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
        """Yield the training examples without shuffling, noise, orth
        variants or projectivization.
        """
        examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
        yield from examples

    def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
        """Yield the development examples with docs and gold parses attached."""
        examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
                                        ignore_misaligned=ignore_misaligned)
        yield from examples

    @classmethod
    def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
                       noise_level=0.0, orth_variant_level=0.0, make_projective=False,
                       ignore_misaligned=False):
        """Yield examples that carry a doc and a gold parse.

        Setting gold_preproc will result in creating a doc per 'sentence'.

        nlp (Language): Provides `make_doc`, `vocab` and the orth variants.
        examples (iterable): The input examples.
        gold_preproc (bool): Drop the example's doc and build one doc per
            token annotation; otherwise the token annotations are merged.
        max_length (int): If set, only yield examples whose doc is shorter.
        noise_level (float): Passed on to `add_noise`.
        orth_variant_level (float): Passed on to `make_orth_variants`.
        make_projective (bool): Stored on each example before parsing.
        ignore_misaligned (bool): Stored on each example before parsing.
        YIELDS (Example): Examples whose `gold` is not None.
        """
        for example in examples:
            if gold_preproc:
                example.doc = None
            else:
                example = example.merge_sents()
            example.make_projective = make_projective
            example.ignore_misaligned = ignore_misaligned
            # Distinct names here: rebinding `examples` while it is being
            # iterated works, but is needlessly confusing.
            doc_examples = cls._make_docs(nlp, example,
                                          gold_preproc, noise_level=noise_level,
                                          orth_variant_level=orth_variant_level)
            gold_examples = cls._make_golds(doc_examples, vocab=nlp.vocab)
            for ex in gold_examples:
                if ex.gold is not None:
                    if (not max_length) or len(ex.doc) < max_length:
                        yield ex

    @classmethod
    def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
        """Attach (possibly noised) docs to an example.

        RETURNS (list): A single example with a doc made from its text, or,
            if the example has no text, one example per token annotation
            with a doc built from that annotation's words.
        """
        # gold_preproc is not used ?!
        var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
        if example.text is not None:
            var_text = add_noise(var_example.text, noise_level)
            var_example.doc = nlp.make_doc(var_text)
            return [var_example]
        doc_examples = []
        for token_annotation in var_example.token_annotations:
            t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
            doc_example = Example(doc_annotation=example.doc_annotation,
                                  token_annotations=[token_annotation],
                                  doc=t_doc)
            doc_examples.append(doc_example)
        return doc_examples

    @classmethod
    def _make_golds(cls, examples, vocab=None):
        """Turn every (doc, gold) pair of the given examples into a new
        example that holds that doc and that gold parse.
        """
        gold_examples = []
        for example in examples:
            gold_parses = example.get_gold_parses(vocab=vocab)
            for (doc, gold) in gold_parses:
                ex = Example(doc=doc)
                ex.goldparse = gold
                gold_examples.append(ex)
        return gold_examples

def make_orth_variants(nlp, example, orth_variant_level=0.0):
    """Randomly replace punctuation by orthographic variants (and possibly
    lowercase everything) for data augmentation.

    nlp (Language): `nlp.Defaults.single_orth_variants` and
        `nlp.Defaults.paired_orth_variants` are read as lists of dicts with
        "tags" and "variants" keys; for paired variants each entry of
        "variants" is a pair.
    example (Example): The example to augment. Its word lists are not
        modified.
    orth_variant_level (float): The example is returned unchanged whenever
        a random draw from [0, 1) is at least this value.
    RETURNS (Example): Either the original example (no augmentation, no
        token annotations, or the raw text could not be rewritten to match
        the new words), or a new example with modified words and text.
    """
    if random.random() >= orth_variant_level:
        return example
    if not example.token_annotations:
        return example
    raw = example.text
    # Must be bound on both paths: it is read for every token annotation.
    lower = False
    if random.random() >= 0.5:
        lower = True
        if raw is not None:
            raw = raw.lower()
    ndsv = nlp.Defaults.single_orth_variants
    ndpv = nlp.Defaults.paired_orth_variants
    # modify words in paragraph_tuples
    variant_example = Example(doc=raw)
    for token_annotation in example.token_annotations:
        words = token_annotation.words
        tags = token_annotation.tags
        if not words or not tags:
            # add the unmodified annotation
            token_dict = token_annotation.to_dict()
            variant_example.add_token_annotation(**token_dict)
            continue
        # Work on a copy, so that the input example stays intact even if we
        # later give up and return it unchanged.
        words = [w.lower() for w in words] if lower else list(words)
        # single variants
        punct_choices = [random.choice(x["variants"]) for x in ndsv]
        for word_idx in range(len(words)):
            for group, choice in zip(ndsv, punct_choices):
                if tags[word_idx] in group["tags"] \
                        and words[word_idx] in group["variants"]:
                    words[word_idx] = choice
        # paired variants
        punct_choices = [random.choice(x["variants"]) for x in ndpv]
        for word_idx in range(len(words)):
            for group, choice in zip(ndpv, punct_choices):
                if tags[word_idx] in group["tags"] \
                        and words[word_idx] in itertools.chain.from_iterable(group["variants"]):
                    # backup option: random left vs. right from pair
                    pair_idx = random.choice([0, 1])
                    # best option: rely on paired POS tags like `` / ''
                    if len(group["tags"]) == 2:
                        pair_idx = group["tags"].index(tags[word_idx])
                    # next best option: rely on position in variants
                    # (may not be unambiguous, so order of variants matters)
                    else:
                        for pair in group["variants"]:
                            if words[word_idx] in pair:
                                pair_idx = pair.index(words[word_idx])
                    words[word_idx] = choice[pair_idx]

        token_dict = token_annotation.to_dict()
        token_dict["words"] = words
        token_dict["tags"] = tags
        variant_example.add_token_annotation(**token_dict)
    if raw is None:
        return variant_example
    # modify raw to match variant_paragraph_tuples
    variants = []
    for single_variants in ndsv:
        variants.extend(single_variants["variants"])
    for paired_variants in ndpv:
        variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
    # store variants in reverse length order to be able to prioritize
    # longer matches (e.g., "---" before "--")
    variants = sorted(variants, key=lambda x: len(x))
    variants.reverse()
    pieces = []
    raw_idx = 0
    # add initial whitespace
    while raw_idx < len(raw) and re.match(r"\s", raw[raw_idx]):
        pieces.append(raw[raw_idx])
        raw_idx += 1
    for token_annotation in variant_example.token_annotations:
        for word in token_annotation.words:
            match_found = False
            # add identical word
            if word not in variants and raw.startswith(word, raw_idx):
                pieces.append(word)
                raw_idx += len(word)
                match_found = True
            # add variant word
            else:
                for variant in variants:
                    # The first (i.e. longest) variant found at this
                    # position in the raw text is replaced by the new word.
                    if raw.startswith(variant, raw_idx):
                        raw_idx += len(variant)
                        pieces.append(word)
                        match_found = True
                        break
            # something went wrong, abort
            # (add a warning message?)
            if not match_found:
                return example
            # add following whitespace
            while raw_idx < len(raw) and re.match(r"\s", raw[raw_idx]):
                pieces.append(raw[raw_idx])
                raw_idx += 1
    variant_example.doc = "".join(pieces)
    return variant_example


def add_noise(orig, noise_level):
    """Randomly corrupt a text or a list of words.

    orig (unicode or list): A string (corrupted character by character) or
        a list of words (corrupted word by word).
    noise_level (float): The input is returned untouched whenever a random
        draw from [0, 1) is at least this value; the same level is used for
        every individual item.
    RETURNS (unicode or list): The original object, a new string, or a new
        list from which words that became empty have been dropped.
    """
    if random.random() >= noise_level:
        return orig
    if type(orig) != list:
        return "".join(_corrupt(c, noise_level) for c in orig)
    noisy_words = (_corrupt(word, noise_level) for word in orig)
    return [noisy for noisy in noisy_words if noisy]


def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c in [".", "'", "!", "?", ","]:
        return "\n"
    else:
        return c.lower()


def read_json_object(json_corpus_section):
    """Convert already loaded JSON training documents into examples.

    json_corpus_section (iterable): The JSON-formatted documents.
    YIELDS (Example): The reformatted data - one training example per paragraph
    """
    for json_doc in json_corpus_section:
        yield from json_to_examples(json_doc)


def json_to_examples(doc):
    """Turn one document of the JSON training format into examples.

    doc (dict): One entry in the training data. It must have a "paragraphs"
        list; each paragraph needs "sentences" and may have "raw" and "cats".
    YIELDS (Example): The reformatted data - one training example per paragraph
    """
    for paragraph in doc["paragraphs"]:
        example = Example(doc=paragraph.get("raw", None))
        for sent in paragraph["sentences"]:
            ids, words, tags, heads, deps, ner = [], [], [], [], [], []
            for position, token in enumerate(sent["tokens"]):
                words.append(token["orth"])
                ids.append(position)
                tags.append(token.get('tag', "-"))
                # "head" is stored as an offset relative to the token itself.
                heads.append(token.get("head", 0) + position)
                dep = token.get("dep", "")
                # Ensure ROOT label is case-insensitive
                deps.append("ROOT" if dep.lower() == "root" else dep)
                ner.append(token.get("ner", "-"))
            example.add_token_annotation(ids=ids, words=words, tags=tags,
                                         heads=heads, deps=deps, entities=ner,
                                         brackets=sent.get("brackets", []))
        # Each entry of "cats" is a dict with a "label" and a "value".
        cats = {}
        for cat in paragraph.get("cats", {}):
            cats[cat["label"]] = cat["value"]
        example.add_doc_annotation(cats=cats)
        yield example


def read_json_file(loc, docs_filter=None, limit=None):
    """Read examples from a JSON training file, or from every entry of a
    directory (recursively).

    loc (unicode or Path): The file or directory to read.
    docs_filter (callable): Optional predicate; JSON documents for which it
        returns a falsy value are skipped.
    limit (int): Passed along when recursing, but otherwise not used here.
    YIELDS (Example): One training example per paragraph.
    """
    loc = util.ensure_path(loc)
    if loc.is_dir():
        # iterdir() already yields paths that include `loc`, so they must not
        # be joined onto `loc` again (that breaks for relative directories).
        # The filter is passed down so it applies to nested files as well.
        for child in loc.iterdir():
            yield from read_json_file(child, docs_filter=docs_filter, limit=limit)
    else:
        for doc in _json_iterate(loc):
            if docs_filter is not None and not docs_filter(doc):
                continue
            yield from json_to_examples(doc)


def _json_iterate(loc):
    # Yield the objects of a JSON file one at a time.
    #
    # We should've made these files jsonl...But since we didn't, parse out
    # the docs one-by-one to reduce memory usage.
    # It's okay to read in the whole file -- just don't parse it into JSON.
    #
    # The bytes are scanned once while tracking bracket/brace nesting; every
    # {...} object that opens and closes directly inside the outermost [...]
    # is decoded as UTF-8, parsed on its own and yielded.
    cdef bytes py_raw
    loc = util.ensure_path(loc)
    with loc.open("rb") as file_:
        py_raw = file_.read()
    # Raw char pointer into the bytes object, for cheap per-byte access.
    raw = <char*>py_raw
    # Current nesting depth of [...] and {...} outside of string literals.
    cdef int square_depth = 0
    cdef int curly_depth = 0
    # Whether we are currently between two unescaped double quotes.
    cdef int inside_string = 0
    # Set after a backslash, so that the following byte is skipped.
    cdef int escape = 0
    # Byte offset at which the current top-level object started (-1: none).
    cdef int start = -1
    cdef char c
    cdef char quote = ord('"')
    cdef char backslash = ord("\\")
    cdef char open_square = ord("[")
    cdef char close_square = ord("]")
    cdef char open_curly = ord("{")
    cdef char close_curly = ord("}")
    for i in range(len(py_raw)):
        c = raw[i]
        # The byte following a backslash is never interpreted (this keeps an
        # escaped quote from toggling the string state).
        if escape:
            escape = False
            continue
        if c == backslash:
            escape = True
            continue
        if c == quote:
            inside_string = not inside_string
            continue
        # Brackets and braces inside string literals do not count.
        if inside_string:
            continue
        if c == open_square:
            square_depth += 1
        elif c == close_square:
            square_depth -= 1
        elif c == open_curly:
            # A brace opening directly inside the outermost array marks the
            # start of a new document.
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif c == close_curly:
            curly_depth -= 1
            # Back at the level of the outermost array: the document is
            # complete, so parse just this slice.
            if square_depth == 1 and curly_depth == 0:
                py_str = py_raw[start : i + 1].decode("utf8")
                try:
                    yield srsly.json_loads(py_str)
                except Exception:
                    # Show the offending text before re-raising the error.
                    print(py_str)
                    raise
                start = -1


def iob_to_biluo(tags):
    """Convert a sequence of IOB tags to BILUO tags.

    tags (iterable): The IOB tags. The input itself is not modified.
    RETURNS (list): The corresponding BILUO tags.
    """
    remaining = list(tags)
    converted = []
    # Alternately strip a run of "O" tags and one entity from the front of
    # the working copy until nothing is left.
    while remaining:
        converted += _consume_os(remaining)
        converted += _consume_ent(remaining)
    return converted


def _consume_os(tags):
    while tags and tags[0] == "O":
        yield tags.pop(0)


def _consume_ent(tags):
    if not tags:
        return []
    tag = tags.pop(0)
    target_in = "I" + tag[1:]
    target_last = "L" + tag[1:]
    length = 1
    while tags and tags[0] in {target_in, target_last}:
        length += 1
        tags.pop(0)
    label = tag[2:]
    if length == 1:
        if len(label) == 0:
            raise ValueError(Errors.E177.format(tag=tag))
        return ["U-" + label]
    else:
        start = "B-" + label
        end = "L-" + label
        middle = ["I-%s" % label for _ in range(1, length - 1)]
        return [start] + middle + [end]


cdef class TokenAnnotation:
    """Token-level annotations: parallel lists with one entry per token,
    plus a list of brackets.
    """
    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
        """Store the given annotations; any falsy argument becomes a new
        empty list.
        """
        self.ids = ids or []
        self.words = words or []
        self.tags = tags or []
        self.heads = heads or []
        self.deps = deps or []
        self.entities = entities or []
        self.brackets = brackets or []
        self.morphology = morphology or []

    @classmethod
    def from_dict(cls, token_dict):
        """Create a TokenAnnotation from a dict as produced by `to_dict`.
        Missing keys are treated as absent annotations.
        """
        lookup = token_dict.get
        return cls(ids=lookup("ids"),
                   words=lookup("words"),
                   tags=lookup("tags"),
                   heads=lookup("heads"),
                   deps=lookup("deps"),
                   entities=lookup("entities"),
                   morphology=lookup("morphology"),
                   brackets=lookup("brackets"))

    def to_dict(self):
        """Return the annotations as a dict keyed by attribute name."""
        token_dict = {}
        token_dict["ids"] = self.ids
        token_dict["words"] = self.words
        token_dict["tags"] = self.tags
        token_dict["heads"] = self.heads
        token_dict["deps"] = self.deps
        token_dict["entities"] = self.entities
        token_dict["morphology"] = self.morphology
        token_dict["brackets"] = self.brackets
        return token_dict


cdef class DocAnnotation:
    """Document-level annotations: text categories and entity links."""
    def __init__(self, cats=None, links=None):
        """Store the given annotations; any falsy argument becomes a new
        empty dict.
        """
        self.cats = cats or {}
        self.links = links or {}

    @classmethod
    def from_dict(cls, doc_dict):
        """Create a DocAnnotation from a dict as produced by `to_dict`."""
        cats = doc_dict.get("cats")
        links = doc_dict.get("links")
        return cls(cats=cats, links=links)

    def to_dict(self):
        """Return the annotations as a dict with "cats" and "links"."""
        doc_dict = {}
        doc_dict["cats"] = self.cats
        doc_dict["links"] = self.links
        return doc_dict


cdef class Example:
    def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
                 make_projective=False, ignore_misaligned=False, goldparse=None):
        """ Doc can either be text, or an actual Doc """
        self.doc = doc
        self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
        self.token_annotations = token_annotations if token_annotations else []
        self.make_projective = make_projective
        self.ignore_misaligned = ignore_misaligned
        self.goldparse = goldparse

    @classmethod
    def from_gold(cls, goldparse, doc=None):
        doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
        token_annotation = goldparse.get_token_annotation()
        return cls(doc_annotation, [token_annotation], doc)

    @classmethod
    def from_dict(cls, example_dict, doc=None):
        token_dicts = example_dict["token_annotations"]
        token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
        doc_dict = example_dict["doc_annotation"]
        doc_annotation = DocAnnotation.from_dict(doc_dict)
        return cls(doc_annotation, token_annotations, doc)

    def to_dict(self):
        """ Note that this method does NOT export the doc, only the annotations ! """
        token_dicts = [t.to_dict() for t in self.token_annotations]
        doc_dict = self.doc_annotation.to_dict()
        return {"token_annotations": token_dicts, "doc_annotation": doc_dict}

    @property
    def text(self):
        if self.doc is None:
            return None
        if isinstance(self.doc, Doc):
            return self.doc.text
        return self.doc

    @property
    def gold(self):
        if self.goldparse is None:
            doc, gold = self.get_gold_parses(merge=True)[0]
            self.goldparse = gold
        return self.goldparse

    def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
                             deps=None, entities=None, morphology=None, brackets=None):
        t = TokenAnnotation(ids=ids, words=words, tags=tags,
                            heads=heads, deps=deps, entities=entities,
                            morphology=morphology, brackets=brackets)
        self.token_annotations.append(t)

    def add_doc_annotation(self, cats=None, links=None):
        if cats:
            self.doc_annotation.cats.update(cats)
        if links:
            self.doc_annotation.links.update(links)

    def merge_sents(self):
        """ Merge the list of token annotations into one object and return this new object """
        m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
        m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
        m_brackets = []
        i = 0
        for t in self.token_annotations:
            m_ids.extend(id_ + i for id_ in t.ids)
            m_words.extend(t.words)
            m_tags.extend(t.tags)
            m_heads.extend(head + i if head else None for head in t.heads)
            m_deps.extend(t.deps)
            m_ents.extend(t.entities)
            m_morph.extend(t.morphology)
            m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
                              for b in t.brackets)
            i += len(t.ids)
        m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
                                       heads=m_heads, deps=m_deps, entities=m_ents,
                                       morphology=m_morph, brackets=m_brackets)
        return m_example


    def get_gold_parses(self, merge=False, vocab=None):
        """Return a list of (doc, GoldParse) objects.
        If merge is set to True, add all Token annotations to one big list."""
        d = self.doc_annotation
        # merging different sentences
        if merge:
            merged_example = self.merge_sents()
            assert(len(merged_example.token_annotations)) == 1
            t = merged_example.token_annotations[0]
            m_doc = merged_example.doc
            if not m_doc:
                if not vocab:
                    raise ValueError(Errors.E998)
                m_doc = Doc(vocab, words=t.words)
            try:
                gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
            except AlignmentError:
                if self.ignore_misaligned:
                    gp = None
                else:
                    raise
            return [(self.doc, gp)]
        # we only have one sentence and an appropriate doc
        elif len(self.token_annotations) == 1 and self.doc is not None:
            t = self.token_annotations[0]
            try:
                gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
            except AlignmentError:
                if self.ignore_misaligned:
                    gp = None
                else:
                    raise
            return [(self.doc, gp)]
        # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
        else:
            parses = []
            for t in self.token_annotations:
                if not vocab:
                    raise ValueError(Errors.E998)
                t_doc = Doc(vocab, words=t.words)
                try:
                    gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
                except AlignmentError:
                    if self.ignore_misaligned:
                        gp = None
                    else:
                        raise
                if gp is not None:
                    parses.append((t_doc, gp))
            return parses

    @classmethod
    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
        """Normalise a variety of input formats into a list of Example objects.

        examples: A single Example, a single tuple, or an iterable whose items
            are text strings, Doc objects, `(doc_or_text, gold)` 2-tuples
            (where `gold` is a GoldParse or a dict of GoldParse keyword
            arguments), or anything else (passed through unchanged).
        make_doc (callable): Turns a text string into a Doc. Needs to be
            provided when the examples contain text strings and
            keep_raw_text is False.
        keep_raw_text (bool): If True, text strings are stored as-is instead
            of being converted with `make_doc`.
        RETURNS (list): The converted examples.
        """
        # A lone Example is simply wrapped.
        if isinstance(examples, Example):
            return [examples]
        # A lone tuple is treated as a one-item batch.
        if isinstance(examples, tuple):
            examples = [examples]

        results = []
        for item in examples:
            # Raw text: either keep the string or tokenize it first.
            if isinstance(item, basestring_):
                if keep_raw_text:
                    results.append(Example(doc=item))
                else:
                    results.append(Example(doc=make_doc(item)))
                continue

            # Doc: wrap directly.
            if isinstance(item, Doc):
                results.append(Example(doc=item))
                continue

            # Anything that is not a 2-tuple is passed through untouched.
            if not (isinstance(item, tuple) and len(item) == 2):
                results.append(item)
                continue

            # (doc_or_text, gold) pair.
            doc, gold = item
            gold_dict = {}
            if isinstance(doc, basestring_) and not keep_raw_text:
                doc = make_doc(doc)
            if isinstance(gold, dict):
                # Remember the dict so it can be reported if conversion fails.
                gold_dict = gold
                if doc is not None or gold.get("words", None) is not None:
                    gold = GoldParse(doc, **gold)
                else:
                    gold = None
            if gold is None:
                raise ValueError(Errors.E999.format(gold_dict=gold_dict))
            results.append(Example.from_gold(goldparse=gold, doc=doc))
        return results


cdef class GoldParse:
    """Collection for training annotations.

    Stores gold-standard token annotations (tags, heads, dependency labels,
    entity tags, morphology) projected onto the tokenization of a candidate
    `Doc`, together with doc-level `cats` and `links`.

    DOCS: https://spacy.io/api/goldparse
    """
    @classmethod
    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
        """Create a GoldParse from separate doc- and token-level annotations.

        doc (Doc): The document the annotations refer to.
        doc_annotation: Object providing the `cats` and `links` attributes.
        token_annotation: Object providing the `words`, `tags`, `heads`,
            `deps`, `entities` and `morphology` attributes.
        make_projective (bool): Passed through to the constructor.
        RETURNS (GoldParse): The newly constructed object.
        """
        return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
                   heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
                   morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
                   make_projective=make_projective)

    def get_token_annotation(self):
        """Package the aligned per-token annotations as a TokenAnnotation.

        RETURNS (TokenAnnotation): The annotations held by this object. `ids`
            is `0..len(self.words)-1`, or None if `self.words` is empty/unset.
        """
        ids = None
        if self.words:
            ids = list(range(len(self.words)))

        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                               heads=self.heads, deps=self.labels, entities=self.ner,
                               morphology=self.morphology)

    def __init__(self, doc, words=None, tags=None, morphology=None,
                 heads=None, deps=None, entities=None, make_projective=False,
                 cats=None, links=None):
        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        morphology (iterable): A sequence of per-word morphology annotations.
        heads (iterable): A sequence of integers, representing syntactic
            head offsets.
            NOTE(review): the code below uses each value as an index into the
            gold words (via `gold_to_cand`), i.e. as an absolute position.
        deps (iterable): A sequence of strings, representing the syntactic
            relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        make_projective (bool): Whether to projectivize `heads` and `deps`
            (via `nonproj.projectivize`) before aligning.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        links (dict): A dict with `(start_char, end_char)` keys,
            and the values being dicts with kb_id:value entries,
            representing the external IDs in a knowledge base (KB)
            mapped to either 1.0 or 0.0, indicating positive and
            negative examples respectively.
        RAISES (ValueError): If the aligned heads contain a cycle (E069).
        RETURNS (GoldParse): The newly constructed object.
        """
        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)

        self.cats = {} if cats is None else dict(cats)
        self.links = {} if links is None else dict(links)

        # avoid allocating memory if the doc does not contain any tokens
        if self.length > 0:
            # Fill in defaults for every annotation layer that was not given.
            if not words:
                words = [token.text for token in doc]
            if not tags:
                tags = [None for _ in words]
            if not heads:
                heads = [None for _ in words]
            if not deps:
                deps = [None for _ in words]
            if not morphology:
                morphology = [None for _ in words]
            if entities is None:
                # No entity annotation at all: every token is a missing value.
                entities = ["-" for _ in words]
            elif len(entities) == 0:
                # Explicitly empty annotation: no token is part of an entity.
                entities = ["O" for _ in words]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
                entities = [(ent if ent is not None else "-") for ent in entities]
                if not isinstance(entities[0], basestring_):
                    # Assume we have entities specified by character offset.
                    entities = biluo_tags_from_offsets(doc, entities)

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

            # Python-level annotations, one slot per candidate (doc) token.
            self.words = [None] * len(doc)
            self.tags = [None] * len(doc)
            self.heads = [None] * len(doc)
            self.labels = [None] * len(doc)
            self.ner = [None] * len(doc)
            self.morphology = [None] * len(doc)

            # This needs to be done before we align the words
            if make_projective and heads is not None and deps is not None:
                heads, deps = nonproj.projectivize(heads, deps)

            # Do many-to-one alignment for misaligned tokens.
            # If we over-segment, we'll have one gold word that covers a sequence
            # of predicted words
            # If we under-segment, we'll have one predicted word that covers a
            # sequence of gold words.
            # If we "mis-segment", we'll have a sequence of predicted words covering
            # a sequence of gold words. That's many-to-many -- we don't do that.
            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

            # Negative alignment entries mean "no one-to-one match".
            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

            # Keep the unaligned gold annotations around as well.
            self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
                                        heads=heads, deps=deps, entities=entities, morphology=morphology,
                                        brackets=[])

            for i, gold_i in enumerate(self.cand_to_gold):
                if doc[i].text.isspace():
                    self.words[i] = doc[i].text
                    self.tags[i] = "_SP"
                    self.heads[i] = None
                    self.labels[i] = None
                    self.ner[i] = None
                    self.morphology[i] = set()
                if gold_i is None:
                    if i in i2j_multi:
                        # Several candidate tokens map onto one gold word.
                        self.words[i] = words[i2j_multi[i]]
                        self.tags[i] = tags[i2j_multi[i]]
                        self.morphology[i] = morphology[i2j_multi[i]]
                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
                        is_first = i2j_multi[i] != i2j_multi.get(i-1)
                        # Set next word in multi-token span as head, until last
                        if not is_last:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
                            head_i = heads[i2j_multi[i]]
                            # Compare against None explicitly: a head index of
                            # 0 (the first gold token) is a valid head and must
                            # not be dropped by a truthiness test.
                            if head_i is not None:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        # Now set NER... This is annoying because if an entity
                        # word has been split into several tokens, we need to
                        # adjust the BILUO tags. We can't have BB or LL etc.
                        # Any tag not matched below (e.g. the missing value
                        # "-") leaves self.ner[i] as None.
                        # Case 1: O -- easy.
                        ner_tag = entities[i2j_multi[i]]
                        if ner_tag == "O":
                            self.ner[i] = "O"
                        # Case 2: U. This has to become a B I* L sequence.
                        elif ner_tag.startswith("U-"):
                            if is_first:
                                self.ner[i] = ner_tag.replace("U-", "B-", 1)
                            elif is_last:
                                self.ner[i] = ner_tag.replace("U-", "L-", 1)
                            else:
                                self.ner[i] = ner_tag.replace("U-", "I-", 1)
                        # Case 3: B. If not first, change to I.
                        elif ner_tag.startswith("B-"):
                            if is_first:
                                self.ner[i] = ner_tag
                            else:
                                self.ner[i] = ner_tag.replace("B-", "I-", 1)
                        # Case 4: L. If not last, change to I.
                        elif ner_tag.startswith("L-"):
                            if is_last:
                                self.ner[i] = ner_tag
                            else:
                                self.ner[i] = ner_tag.replace("L-", "I-", 1)
                        # Case 5: I. Stays correct
                        elif ner_tag.startswith("I-"):
                            self.ner[i] = ner_tag
                else:
                    # One-to-one alignment: copy the gold annotations over.
                    self.words[i] = words[gold_i]
                    self.tags[i] = tags[gold_i]
                    self.morphology[i] = morphology[gold_i]
                    if heads[gold_i] is None:
                        self.heads[i] = None
                    else:
                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
                    self.labels[i] = deps[gold_i]
                    self.ner[i] = entities[gold_i]

            # Prevent whitespace that isn't within entities from being tagged as
            # an entity.
            for i in range(len(self.ner)):
                if self.tags[i] == "_SP":
                    prev_ner = self.ner[i-1] if i >= 1 else None
                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
                    if prev_ner == "O" or next_ner == "O":
                        self.ner[i] = "O"

            # Refuse annotations whose aligned heads contain a cycle.
            cycle = nonproj.contains_cycle(self.heads)
            if cycle is not None:
                raise ValueError(Errors.E069.format(cycle=cycle,
                    cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
                    doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective
        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)

    property sent_starts:
        def __get__(self):
            # One value per candidate token, read from the C-level array.
            return [self.c.sent_start[i] for i in range(self.length)]

        def __set__(self, sent_starts):
            # `sent_starts` is indexed by gold token; values are written to the
            # aligned candidate token and skipped where there is no alignment.
            for gold_i, is_sent_start in enumerate(sent_starts):
                i = self.gold_to_cand[gold_i]
                if i is not None:
                    if is_sent_start in (1, True):
                        self.c.sent_start[i] = 1
                    # Note that 0 compares equal to False, so 0 is stored as -1.
                    elif is_sent_start in (-1, False):
                        self.c.sent_start[i] = -1
                    else:
                        self.c.sent_start[i] = 0


def docs_to_json(docs, id=0):
    """Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command.

    docs (iterable / Doc): The Doc object(s) to convert. A single Doc is
        treated as a one-element list.
    id (int): Id for the JSON.
    RETURNS (dict): The data in spaCy's JSON format
        - each input doc will be treated as a paragraph in the output doc
    """
    if isinstance(docs, Doc):
        docs = [docs]

    paragraphs = []
    for doc in docs:
        # Text categories become a list of label/value records.
        cats = [{"label": label, "value": value} for label, value in doc.cats.items()]

        # Entity spans are written out as one BILUO tag per token.
        offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        ner_tags = biluo_tags_from_offsets(doc, offsets)

        sentences = []
        for sent in doc.sents:
            tokens = []
            for token in sent:
                entry = {"id": token.i, "orth": token.text}
                if doc.is_tagged:
                    entry["tag"] = token.tag_
                if doc.is_parsed:
                    # Heads are stored relative to the token's own position.
                    entry["head"] = token.head.i - token.i
                    entry["dep"] = token.dep_
                entry["ner"] = ner_tags[token.i]
                tokens.append(entry)
            sentences.append({"tokens": tokens, "brackets": []})

        paragraphs.append({"raw": doc.text, "sentences": sentences, "cats": cats})

    return {"id": id, "paragraphs": paragraphs}


def biluo_tags_from_offsets(doc, entities, missing="O"):
    """Encode labelled spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).

    doc (Doc): The document that the entity offsets refer to. The output tags
        will refer to the token boundaries within the document.
    entities (iterable): A sequence of `(start, end, label)` triples. `start`
        and `end` should be character-offset integers denoting the slice into
        the original string. The iterable is consumed exactly once, so
        one-shot iterators (e.g. generators) are supported.
    missing (unicode): The tag assigned to tokens that do not overlap with
        any entity span. Defaults to "O".
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "", "O" or "{action}-{label}", where
        action is one of "B", "I", "L", "U". The string "-" is used where the
        entity offsets don't align with the tokenization in the `Doc` object.
        The training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.
    RAISES (ValueError): If two entity spans share a character (E103).

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = biluo_tags_from_offsets(doc, entities)
        >>> assert tags == ["O", "O", 'U-LOC', "O"]
    """
    # Maps every character offset covered by an entity to the span covering
    # it. Used both to detect overlapping spans and, afterwards, to tell
    # non-entity tokens apart from misaligned ones.
    chars_in_ents = {}

    starts = {token.idx: token.i for token in doc}
    ends = {token.idx + len(token): token.i for token in doc}
    biluo = ["-" for _ in doc]
    # Handle entity cases
    for start_char, end_char, label in entities:
        for char_index in range(start_char, end_char):
            if char_index in chars_in_ents:
                raise ValueError(Errors.E103.format(
                    span1=chars_in_ents[char_index],
                    span2=(start_char, end_char, label)))
            chars_in_ents[char_index] = (start_char, end_char, label)

        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = "U-%s" % label
            else:
                biluo[start_token] = "B-%s" % label
                for i in range(start_token+1, end_token):
                    biluo[i] = "I-%s" % label
                biluo[end_token] = "L-%s" % label
    # Now distinguish the O cases from ones where we miss the tokenization.
    # The character map from the first pass is reused rather than iterating
    # `entities` again: a second pass over an exhausted iterator would see no
    # spans and overwrite every tag with `missing`.
    for token in doc:
        token_chars = range(token.idx, token.idx + len(token))
        if not any(i in chars_in_ents for i in token_chars):
            biluo[token.i] = missing
    return biluo


def spans_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into Span objects,
    e.g. to overwrite the doc.ents.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects.
    """
    # tags_to_entities yields inclusive token indices, while Span takes an
    # exclusive end, hence the +1.
    return [
        Span(doc, first_token, last_token + 1, label=ent_label)
        for ent_label, first_token, last_token in tags_to_entities(tags)
    ]


def offsets_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    offsets = []
    for ent in spans_from_biluo_tags(doc, tags):
        offsets.append((ent.start_char, ent.end_char, ent.label_))
    return offsets


def is_punct_label(label):
    """Check whether a dependency label marks punctuation.

    label (unicode): The label to check.
    RETURNS (bool): True if the label is exactly "P", or equals "punct" when
        compared case-insensitively.
    """
    if label == "P":
        return True
    return label.lower() == "punct"