Move GoldParse under spacy.syntax

2025-11-05 02:17:25 +03:00 · 2020-06-06 15:09:25 +02:00 · 2020-06-06 15:09:25 +02:00 · 7b873ce2b1
commit 7b873ce2b1
parent 32c8fb1372
8 changed files with 357 additions and 7 deletions
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 from .stateclass cimport StateClass
 from ..typedefs cimport weight_t, attr_t
 from .transition_system cimport TransitionSystem, Transition
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParseC
 cdef class ArcEager(TransitionSystem):
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -8,7 +8,7 @@ import json
 from ..typedefs cimport hash_t, attr_t
 from ..strings cimport hash_string
-from ..gold cimport GoldParse, GoldParseC
+from .gold_parse cimport GoldParse, GoldParseC
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, set_children_from_heads
 from .stateclass cimport StateClass
--- a/spacy/syntax/gold_parse.pxd
+++ b/spacy/syntax/gold_parse.pxd
@ -0,0 +1,39 @@
 from cymem.cymem cimport Pool
 from .transition_system cimport Transition
 from ..typedefs cimport attr_t
 # C-level storage for gold-standard annotations; GoldParse holds one as `c`.
 # GoldParse.__init__ allocates tags, heads, has_dep, sent_start, labels and
 # ner from its memory pool with one slot per token of the doc. It only
 # allocates them: per the comment there, the values are filled in later by
 # the tagger/parser/entity recogniser.
 cdef struct GoldParseC:
    int* tags
    int* heads
    int* has_dep
    int* sent_start
    attr_t* labels
    # NOTE(review): `brackets` is not allocated or written by the visible
    # GoldParse.__init__ -- confirm whether it is still used.
    int** brackets
    Transition* ner
 # Declaration of the GoldParse extension type (implemented in gold_parse.pyx).
 cdef class GoldParse:
    # Memory pool from which the C arrays in `c` are allocated.
    cdef Pool mem
    # C-level annotation arrays (see GoldParseC).
    cdef GoldParseC c
    # TokenAnnotation built in __init__ from the gold annotations as given
    # (after defaults are filled in), i.e. in gold tokenization, before they
    # are aligned to the doc's tokens.
    cdef readonly object orig
    # Set to len(doc) in __init__; returned by __len__.
    cdef int length
    # Initialised to 0 in __init__.
    cdef public int loss
    # Per-doc-token annotation lists (length len(doc)), produced by aligning
    # the gold annotations to the doc's tokenization. Entries may be None
    # where no gold value could be assigned.
    cdef public list words
    cdef public list tags
    cdef public list pos
    cdef public list morphs
    cdef public list lemmas
    cdef public list sent_starts
    cdef public list heads
    cdef public list labels
    # NOTE(review): `orths` is not assigned by the visible __init__.
    cdef public dict orths
    cdef public list ner
    # NOTE(review): `brackets` is not assigned by the visible __init__.
    cdef public dict brackets
    # Copies of the `cats` / `links` dicts passed to __init__ ({} if omitted).
    cdef public dict cats
    cdef public dict links
    # Alignment maps: doc-token index -> gold-token index and the reverse,
    # with None where there is no one-to-one alignment.
    cdef readonly list cand_to_gold
    cdef readonly list gold_to_cand
--- a/spacy/syntax/gold_parse.pyx
+++ b/spacy/syntax/gold_parse.pyx
@ -0,0 +1,311 @@
 # cython: profile=True
 import re
 import random
 import numpy
 import tempfile
 import shutil
 import itertools
 from pathlib import Path
 import srsly
 import warnings
 from .. import util
 from ..syntax import nonproj
 from ..tokens import Doc, Span
 from ..errors import Errors, AlignmentError, Warnings
 from .iob_utils import offsets_from_biluo_tags
 from .align import align
 punct_re = re.compile(r"\W")
 def is_punct_label(label):
    """Check whether a dependency label marks punctuation.

    label (str): The dependency label to test.
    RETURNS (bool): True if the label is exactly "P" or equals "punct"
        when lower-cased, otherwise False.
    """
    if label == "P":
        return True
    return label.lower() == "punct"
 cdef class GoldParse:
    """Collection for training annotations.
    DOCS: https://spacy.io/api/goldparse
    """
    @classmethod
    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
        """Alternate constructor that unpacks annotation objects.

        doc (Doc): The document the annotations refer to.
        doc_annotation: Object exposing the doc-level attributes `cats` and
            `links`.
        token_annotation: Object exposing the token-level attributes `words`,
            `tags`, `pos`, `morphs`, `lemmas`, `heads`, `deps`, `entities`
            and `sent_starts`.
        make_projective (bool): Passed through to the constructor unchanged.
        RETURNS (GoldParse): The newly constructed object.
        """
        # Gather the token-level fields first, then the doc-level ones, and
        # forward everything to the regular constructor as keyword arguments.
        annotations = {
            "words": token_annotation.words,
            "tags": token_annotation.tags,
            "pos": token_annotation.pos,
            "morphs": token_annotation.morphs,
            "lemmas": token_annotation.lemmas,
            "heads": token_annotation.heads,
            "deps": token_annotation.deps,
            "entities": token_annotation.entities,
            "sent_starts": token_annotation.sent_starts,
            "cats": doc_annotation.cats,
            "links": doc_annotation.links,
            "make_projective": make_projective,
        }
        return cls(doc, **annotations)
    def get_token_annotation(self):
        """Package the aligned token-level annotations as a TokenAnnotation.

        The ids are 0..len(self.words)-1 when `self.words` is non-empty and
        None otherwise; `self.labels` is passed as `deps` and `self.ner` as
        `entities`.

        RETURNS (TokenAnnotation): The token-level annotations of this object.
        """
        ids = list(range(len(self.words))) if self.words else None
        return TokenAnnotation(
            ids=ids,
            words=self.words,
            tags=self.tags,
            pos=self.pos,
            morphs=self.morphs,
            lemmas=self.lemmas,
            heads=self.heads,
            deps=self.labels,
            entities=self.ner,
            sent_starts=self.sent_starts,
        )
    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
                 lemmas=None, heads=None, deps=None, entities=None,
                 sent_starts=None, make_projective=False, cats=None,
                 links=None):
        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

        The gold annotations are given in gold tokenization (`words`) and are
        aligned to the tokens of `doc`; the public per-token lists (`words`,
        `tags`, `heads`, `ner`, ...) end up with one entry per token of `doc`.
        The annotations as provided are kept on `self.orig`.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        pos (iterable): A sequence of strings, representing UPOS annotations.
        morphs (iterable): A sequence of strings, representing morph
            annotations.
        lemmas (iterable): A sequence of strings, representing lemma
            annotations.
        heads (iterable): A sequence of integers, representing syntactic
            head offsets.
        deps (iterable): A sequence of strings, representing the syntactic
            relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        sent_starts (iterable): A sequence of sentence position tags, 1 for
            the first word in a sentence, 0 for all others.
        make_projective (bool): If True and both heads and deps are provided,
            projectivize the tree before aligning.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        links (dict): A dict with `(start_char, end_char)` keys,
            and the values being dicts with kb_id:value entries,
            representing the external IDs in a knowledge base (KB)
            mapped to either 1.0 or 0.0, indicating positive and
            negative examples respectively.
        RAISES (ValueError): If the aligned heads contain a cycle (E069).
        RETURNS (GoldParse): The newly constructed object.
        """
        # NOTE(review): `TokenAnnotation` and `biluo_tags_from_offsets` are used
        # below but are not imported in the import block visible in this file.
        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)
        self.cats = {} if cats is None else dict(cats)
        self.links = {} if links is None else dict(links)

        # temporary doc for aligning entity annotation
        entdoc = None

        # avoid allocating memory if the doc does not contain any tokens
        if self.length == 0:
            # NOTE(review): pos, lemmas, sent_starts, cand_to_gold and
            # gold_to_cand are left unset in this branch.
            self.words = []
            self.tags = []
            self.heads = []
            self.labels = []
            self.ner = []
            self.morphs = []
            # set a minimal orig so that the scorer can score an empty doc
            self.orig = TokenAnnotation(ids=[])
        else:
            # Fill in defaults: missing annotation layers become all-None
            # lists with one entry per gold word.
            if not words:
                words = [token.text for token in doc]
            if not tags:
                tags = [None for _ in words]
            if not pos:
                pos = [None for _ in words]
            if not morphs:
                morphs = [None for _ in words]
            if not lemmas:
                lemmas = [None for _ in words]
            if not heads:
                heads = [None for _ in words]
            if not deps:
                deps = [None for _ in words]
            if not sent_starts:
                sent_starts = [None for _ in words]
            # entities=None means "missing" ("-"), while an empty sequence
            # means "no entities" ("O").
            if entities is None:
                entities = ["-" for _ in words]
            elif len(entities) == 0:
                entities = ["O" for _ in words]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
                entities = [(ent if ent is not None else "-") for ent in entities]
                if not isinstance(entities[0], str):
                    # Assume we have entities specified by character offset.
                    # Create a temporary Doc corresponding to provided words
                    # (to preserve gold tokenization) and text (to preserve
                    # character offsets).
                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
                    entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
                    # There may be some additional whitespace tokens in the
                    # temporary doc, so check that the annotations align with
                    # the provided words while building a list of BILUO labels.
                    entities = []
                    words_offset = 0
                    for i in range(len(entdoc_words)):
                        if words[i + words_offset] == entdoc_words[i]:
                            entities.append(entdoc_entities[i])
                        else:
                            words_offset -= 1
                    if len(entities) != len(words):
                        warnings.warn(Warnings.W029.format(text=doc.text))
                        entities = ["-" for _ in words]

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

            # Public per-token lists, indexed by doc token.
            self.words = [None] * len(doc)
            self.tags = [None] * len(doc)
            self.pos = [None] * len(doc)
            self.morphs = [None] * len(doc)
            self.lemmas = [None] * len(doc)
            self.heads = [None] * len(doc)
            self.labels = [None] * len(doc)
            self.ner = [None] * len(doc)
            self.sent_starts = [None] * len(doc)

            # This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
                heads, deps = nonproj.projectivize(heads, deps)

            # Do many-to-one alignment for misaligned tokens.
            # If we over-segment, we'll have one gold word that covers a sequence
            # of predicted words
            # If we under-segment, we'll have one predicted word that covers a
            # sequence of gold words.
            # If we "mis-segment", we'll have a sequence of predicted words covering
            # a sequence of gold words. That's many-to-many -- we don't do that
            # except for NER spans where the start and end can be aligned.
            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

            # Negative indices from the aligner mean "no one-to-one alignment".
            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

            self.orig = TokenAnnotation(ids=list(range(len(words))),
                    words=words, tags=tags, pos=pos, morphs=morphs,
                    lemmas=lemmas, heads=heads, deps=deps, entities=entities,
                    sent_starts=sent_starts, brackets=[])

            for i, gold_i in enumerate(self.cand_to_gold):
                if doc[i].text.isspace():
                    # Whitespace tokens get fixed annotations. There is no
                    # `continue` here, so the alignment branches below still
                    # run for this token and may overwrite these values.
                    self.words[i] = doc[i].text
                    self.tags[i] = "_SP"
                    self.pos[i] = "SPACE"
                    self.morphs[i] = None
                    self.lemmas[i] = None
                    self.heads[i] = None
                    self.labels[i] = None
                    self.ner[i] = None
                    self.sent_starts[i] = 0
                if gold_i is None:
                    # No one-to-one alignment: check whether this doc token is
                    # one of several that together cover a single gold word.
                    if i in i2j_multi:
                        self.words[i] = words[i2j_multi[i]]
                        self.tags[i] = tags[i2j_multi[i]]
                        self.pos[i] = pos[i2j_multi[i]]
                        self.morphs[i] = morphs[i2j_multi[i]]
                        self.lemmas[i] = lemmas[i2j_multi[i]]
                        self.sent_starts[i] = sent_starts[i2j_multi[i]]
                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
                        # Set next word in multi-token span as head, until last
                        if not is_last:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
                            head_i = heads[i2j_multi[i]]
                            # Compare against None explicitly: a gold head
                            # index of 0 is a valid head and must not be
                            # treated as missing.
                            if head_i is not None:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        ner_tag = entities[i2j_multi[i]]
                        # Assign O/- for many-to-one O/- NER tags
                        if ner_tag in ("O", "-"):
                            self.ner[i] = ner_tag
                else:
                    # One-to-one alignment: copy the gold annotations over,
                    # mapping the head index into doc-token space.
                    self.words[i] = words[gold_i]
                    self.tags[i] = tags[gold_i]
                    self.pos[i] = pos[gold_i]
                    self.morphs[i] = morphs[gold_i]
                    self.lemmas[i] = lemmas[gold_i]
                    self.sent_starts[i] = sent_starts[gold_i]
                    if heads[gold_i] is None:
                        self.heads[i] = None
                    else:
                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
                    self.labels[i] = deps[gold_i]
                    self.ner[i] = entities[gold_i]

            # Assign O/- for one-to-many O/- NER tags
            for j, cand_j in enumerate(self.gold_to_cand):
                if cand_j is None:
                    if j in j2i_multi:
                        i = j2i_multi[j]
                        ner_tag = entities[j]
                        if ner_tag in ("O", "-"):
                            self.ner[i] = ner_tag

            # If there is entity annotation and some tokens remain unaligned,
            # align all entities at the character level to account for all
            # possible token misalignments within the entity spans
            if any([e not in ("O", "-") for e in entities]) and None in self.ner:
                # If the temporary entdoc wasn't created above, initialize it
                if not entdoc:
                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
                # Get offsets based on gold words and BILUO entities
                entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
                aligned_offsets = []
                aligned_spans = []
                # Filter offsets to identify those that align with doc tokens
                for offset in entdoc_offsets:
                    span = doc.char_span(offset[0], offset[1])
                    if span and not span.text.isspace():
                        aligned_offsets.append(offset)
                        aligned_spans.append(span)
                # Convert back to BILUO for doc tokens and assign NER for all
                # aligned spans
                biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
                for span in aligned_spans:
                    for i in range(span.start, span.end):
                        self.ner[i] = biluo_tags[i]

            # Prevent whitespace that isn't within entities from being tagged as
            # an entity.
            for i in range(len(self.ner)):
                if self.tags[i] == "_SP":
                    prev_ner = self.ner[i-1] if i >= 1 else None
                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
                    if prev_ner == "O" or next_ner == "O":
                        self.ner[i] = "O"

            # Reject annotations whose aligned heads form a cycle.
            cycle = nonproj.contains_cycle(self.heads)
            if cycle is not None:
                raise ValueError(Errors.E069.format(cycle=cycle,
                    cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
                    doc_tokens=" ".join(words[:50])))
    def __len__(self):
        """Get the length recorded at construction time.

        This is `self.length`, which `__init__` sets to `len(doc)`, i.e. the
        number of tokens in the doc the annotations were aligned to (not the
        number of gold-standard words passed in).

        RETURNS (int): The number of tokens.
        """
        return self.length
    @property
    def is_projective(self):
        """Whether the aligned heads form a projective dependency tree.

        RETURNS (bool): False if `nonproj.is_nonproj_tree` reports the heads
            in `self.heads` as non-projective, True otherwise.
        """
        is_nonprojective = nonproj.is_nonproj_tree(self.heads)
        return not is_nonprojective
--- a/spacy/syntax/ner.pxd
+++ b/spacy/syntax/ner.pxd
@ -1,6 +1,6 @@
 from .transition_system cimport TransitionSystem
 from .transition_system cimport Transition
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParseC
 from ..typedefs cimport attr_t
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -7,7 +7,7 @@ from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
-from ..gold cimport GoldParseC, GoldParse
+from .gold_parse cimport GoldParseC, GoldParse
 from ..lexeme cimport Lexeme
 from ..attrs cimport IS_SPACE
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -20,7 +20,7 @@ import numpy
 import warnings
 from ..tokens.doc cimport Doc
-from ..gold cimport GoldParse
+from .gold_parse cimport GoldParse
 from ..typedefs cimport weight_t, class_t, hash_t
 from ._parser_model cimport alloc_activations, free_activations
 from ._parser_model cimport predict_states, arg_max_if_valid
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -2,8 +2,8 @@ from cymem.cymem cimport Pool
 from ..typedefs cimport attr_t, weight_t
 from ..structs cimport TokenC
-from ..gold cimport GoldParse
+from .gold_parse cimport GoldParse
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParseC
 from ..strings cimport StringStore
 from .stateclass cimport StateClass
 from ._state cimport StateC