# cython: profile=True
import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly
import warnings
from .. import util
from ..syntax import nonproj
from ..tokens import Doc, Span
from ..errors import Errors, AlignmentError, Warnings
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
from .align import align
# TokenAnnotation is used below but its home module isn't shown in this
# file; this import path is an assumption.
from .annotation import TokenAnnotation

punct_re = re.compile(r"\W")


def is_punct_label(label):
    return label == "P" or label.lower() == "punct"


cdef class GoldParse:
    """Collection for training annotations.

    DOCS: https://spacy.io/api/goldparse
    """

    @classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
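        """Construct a GoldParse from a doc-level annotation (cats, links)
        and a token-level annotation (words, tags, heads, deps, etc.).
        """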
return cls(doc, words=token_annotation.words,
tags=token_annotation.tags,
pos=token_annotation.pos,
morphs=token_annotation.morphs,
lemmas=token_annotation.lemmas,
heads=token_annotation.heads,
deps=token_annotation.deps,
entities=token_annotation.entities,
sent_starts=token_annotation.sent_starts,
cats=doc_annotation.cats,
links=doc_annotation.links,
make_projective=make_projective)

    def get_token_annotation(self):
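        """Return a TokenAnnotation that mirrors this GoldParse's aligned
        annotations, with ids generated from the word positions.
        """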
ids = None
if self.words:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
pos=self.pos, morphs=self.morphs,
lemmas=self.lemmas, heads=self.heads,
deps=self.labels, entities=self.ner,
sent_starts=self.sent_starts)

    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
lemmas=None, heads=None, deps=None, entities=None,
sent_starts=None, make_projective=False, cats=None,
links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
pos (iterable): A sequence of strings, representing UPOS annotations.
morphs (iterable): A sequence of strings, representing morph
annotations.
lemmas (iterable): A sequence of strings, representing lemma
annotations.
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
sent_starts (iterable): A sequence of sentence position tags, 1 for
the first word in a sentence, 0 for all others.
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative
examples of a label to have the value 0.0. Labels not in the
dictionary are treated as missing - the gradient for those labels
will be zero.
links (dict): A dict with `(start_char, end_char)` keys,
and the values being dicts with kb_id:value entries,
representing the external IDs in a knowledge base (KB)
mapped to either 1.0 or 0.0, indicating positive and
negative examples respectively.

        RETURNS (GoldParse): The newly constructed object.
"""
self.mem = Pool()
self.loss = 0
self.length = len(doc)
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
# temporary doc for aligning entity annotation
entdoc = None
# avoid allocating memory if the doc does not contain any tokens
if self.length == 0:
self.words = []
self.tags = []
self.heads = []
self.labels = []
self.ner = []
self.morphs = []
# set a minimal orig so that the scorer can score an empty doc
self.orig = TokenAnnotation(ids=[])
else:
if not words:
words = [token.text for token in doc]
if not tags:
tags = [None for _ in words]
if not pos:
pos = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
if not lemmas:
lemmas = [None for _ in words]
if not heads:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not sent_starts:
sent_starts = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
entities = ["O" for _ in words]
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], str):
# Assume we have entities specified by character offset.
# Create a temporary Doc corresponding to provided words
# (to preserve gold tokenization) and text (to preserve
# character offsets).
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
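                    # For example, with gold words ["London", "calling"] and
                    # doc.text "London calling", the offset entity
                    # (0, 6, "GPE") becomes the tag sequence ["U-GPE", "O"].
                    # (Hypothetical values, for illustration.)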
# There may be some additional whitespace tokens in the
# temporary doc, so check that the annotations align with
# the provided words while building a list of BILUO labels.
entities = []
words_offset = 0
for i in range(len(entdoc_words)):
if words[i + words_offset] == entdoc_words[i]:
entities.append(entdoc_entities[i])
else:
words_offset -= 1
if len(entities) != len(words):
warnings.warn(Warnings.W029.format(text=doc.text))
entities = ["-" for _ in words]
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.pos = [None] * len(doc)
self.morphs = [None] * len(doc)
self.lemmas = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
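            # For example, if the tokenizer produced ["New", "York"] where
            # the gold tokenization has the single word "New York", i2j holds
            # -1 for both candidate tokens (so cand_to_gold is None) and
            # i2j_multi maps each of them to gold index 0.
            # (Hypothetical tokens, for illustration.)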
self.orig = TokenAnnotation(ids=list(range(len(words))),
words=words, tags=tags, pos=pos, morphs=morphs,
lemmas=lemmas, heads=heads, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
self.pos[i] = "SPACE"
self.morphs[i] = None
self.lemmas[i] = None
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.pos[i] = pos[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
head_i = heads[i2j_multi[i]]
if head_i:
self.heads[i] = self.gold_to_cand[head_i]
self.labels[i] = deps[i2j_multi[i]]
ner_tag = entities[i2j_multi[i]]
# Assign O/- for many-to-one O/- NER tags
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
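                        # e.g. if gold "New York" was split into ["New",
                        # "York"], "New" is attached to "York" with the label
                        # "subtok", while "York" inherits the gold head and
                        # dep label. (Hypothetical tokens, for illustration.)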
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.pos[i] = pos[gold_i]
self.morphs[i] = morphs[gold_i]
self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Assign O/- for one-to-many O/- NER tags
for j, cand_j in enumerate(self.gold_to_cand):
if cand_j is None:
if j in j2i_multi:
i = j2i_multi[j]
ner_tag = entities[j]
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
# If there is entity annotation and some tokens remain unaligned,
# align all entities at the character level to account for all
# possible token misalignments within the entity spans
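            # For example, an entity whose internal tokenization differs can
            # still be recovered as long as its start and end characters line
            # up with candidate token boundaries (doc.char_span returns None
            # otherwise).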
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
# If the temporary entdoc wasn't created above, initialize it
if not entdoc:
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
# Get offsets based on gold words and BILUO entities
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
aligned_offsets = []
aligned_spans = []
# Filter offsets to identify those that align with doc tokens
for offset in entdoc_offsets:
span = doc.char_span(offset[0], offset[1])
if span and not span.text.isspace():
aligned_offsets.append(offset)
aligned_spans.append(span)
# Convert back to BILUO for doc tokens and assign NER for all
# aligned spans
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
for span in aligned_spans:
for i in range(span.start, span.end):
self.ner[i] = biluo_tags[i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
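            # For example, a lone space token sitting between two "O"
            # neighbours is forced to "O" instead of keeping a None tag.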
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
return self.length

    @property
def is_projective(self):
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
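

# A minimal usage sketch (hypothetical sentence and annotations, for
# illustration only):
#
#     import spacy
#     nlp = spacy.blank("en")
#     doc = nlp.make_doc("She ate the pizza")
#     gold = GoldParse(doc, heads=[1, 1, 3, 1],
#                      deps=["nsubj", "ROOT", "det", "dobj"])
#     assert len(gold) == 4
#     assert gold.is_projective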