Move GoldParse under spacy.syntax

2025-11-03 01:17:52 +03:00 · 2020-06-06 15:09:25 +02:00 · 2020-06-06 15:09:25 +02:00 · 7b873ce2b1
commit 7b873ce2b1
parent 32c8fb1372
8 changed files with 357 additions and 7 deletions
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 from .stateclass cimport StateClass
 from ..typedefs cimport weight_t, attr_t
 from .transition_system cimport TransitionSystem, Transition
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParseC


 cdef class ArcEager(TransitionSystem):
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -8,7 +8,7 @@ import json

 from ..typedefs cimport hash_t, attr_t
 from ..strings cimport hash_string
-from ..gold cimport GoldParse, GoldParseC
+from .gold_parse cimport GoldParse, GoldParseC
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc, set_children_from_heads
 from .stateclass cimport StateClass
--- a/spacy/syntax/gold_parse.pxd
+++ b/spacy/syntax/gold_parse.pxd
@ -0,0 +1,39 @@
+from cymem.cymem cimport Pool
+from .transition_system cimport Transition
+from ..typedefs cimport attr_t
+
+
+# C-level storage for gold-standard annotations, held as raw arrays.
+# GoldParse.__init__ allocates tags, heads, labels, has_dep, sent_start and
+# ner from its memory pool with len(doc) entries each; brackets is not
+# allocated by GoldParse.__init__.
+cdef struct GoldParseC:
+    int* tags
+    int* heads
+    int* has_dep
+    int* sent_start
+    attr_t* labels
+    int** brackets
+    Transition* ner
+
+
+cdef class GoldParse:
+    # Memory pool used to allocate the arrays referenced by `c`.
+    cdef Pool mem
+
+    # C-level arrays of annotations.
+    cdef GoldParseC c
+    # The annotations as provided to the constructor (before alignment to
+    # the doc's tokens), stored as a TokenAnnotation.
+    cdef readonly object orig
+
+    # Number of tokens in the doc (set from len(doc)).
+    cdef int length
+    cdef public int loss
+    # Per-token annotations; for a non-empty doc each list has len(doc)
+    # entries, aligned to the doc's tokenization.
+    cdef public list words
+    cdef public list tags
+    cdef public list pos
+    cdef public list morphs
+    cdef public list lemmas
+    cdef public list sent_starts
+    cdef public list heads
+    cdef public list labels
+    cdef public dict orths
+    cdef public list ner
+    cdef public dict brackets
+    # Doc-level annotations, copied from the constructor's cats / links.
+    cdef public dict cats
+    cdef public dict links
+
+    # Token alignment: doc token index -> gold token index, and the reverse.
+    # Entries are None where a token has no one-to-one alignment.
+    cdef readonly list cand_to_gold
+    cdef readonly list gold_to_cand
--- a/spacy/syntax/gold_parse.pyx
+++ b/spacy/syntax/gold_parse.pyx
@ -0,0 +1,311 @@
+# cython: profile=True
+import re
+import random
+import numpy
+import tempfile
+import shutil
+import itertools
+from pathlib import Path
+import srsly
+import warnings
+
+from .. import util
+from ..syntax import nonproj
+from ..tokens import Doc, Span
+from ..errors import Errors, AlignmentError, Warnings
+from .iob_utils import offsets_from_biluo_tags
+from .align import align
+
+
+# Compiled pattern matching a single non-word character (regex class \W).
+punct_re = re.compile(r"\W")
+
+def is_punct_label(label):
+    """Check whether a label denotes punctuation.
+
+    label (str): The label to check.
+    RETURNS (bool): True if the label is exactly "P", or equals "punct"
+        when compared case-insensitively.
+    """
+    if label == "P":
+        return True
+    return label.lower() == "punct"
+
+
+cdef class GoldParse:
+    """Collection for training annotations.
+
+    DOCS: https://spacy.io/api/goldparse
+    """
+    @classmethod
+    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
+        """Create a GoldParse from separate doc-level and token-level
+        annotation objects.
+
+        doc (Doc): The document the annotations refer to.
+        doc_annotation: Object providing the `cats` and `links` attributes.
+        token_annotation: Object providing the `words`, `tags`, `pos`,
+            `morphs`, `lemmas`, `heads`, `deps`, `entities` and `sent_starts`
+            attributes.
+        make_projective (bool): Passed through to the constructor.
+        RETURNS (GoldParse): The newly constructed object.
+        """
+        return cls(doc, words=token_annotation.words,
+                   tags=token_annotation.tags,
+                   pos=token_annotation.pos,
+                   morphs=token_annotation.morphs,
+                   lemmas=token_annotation.lemmas,
+                   heads=token_annotation.heads,
+                   deps=token_annotation.deps,
+                   entities=token_annotation.entities,
+                   sent_starts=token_annotation.sent_starts,
+                   cats=doc_annotation.cats,
+                   links=doc_annotation.links,
+                   make_projective=make_projective)
+
+    def get_token_annotation(self):
+        """Bundle the per-token annotations of this object into a
+        TokenAnnotation.
+
+        RETURNS (TokenAnnotation): The annotations, with `ids` set to
+            0..len(self.words)-1, or to None if `self.words` is empty or
+            unset.
+        """
+        ids = None
+        if self.words:
+            ids = list(range(len(self.words)))
+
+        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
+                               pos=self.pos, morphs=self.morphs,
+                               lemmas=self.lemmas, heads=self.heads,
+                               deps=self.labels, entities=self.ner,
+                               sent_starts=self.sent_starts)
+
+    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
+                 lemmas=None, heads=None, deps=None, entities=None,
+                 sent_starts=None, make_projective=False, cats=None,
+                 links=None):
+        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
+
+        doc (Doc): The document the annotations refer to.
+        words (iterable): A sequence of unicode word strings.
+        tags (iterable): A sequence of strings, representing tag annotations.
+        pos (iterable): A sequence of strings, representing UPOS annotations.
+        morphs (iterable): A sequence of strings, representing morph
+            annotations.
+        lemmas (iterable): A sequence of strings, representing lemma
+            annotations.
+        heads (iterable): A sequence of integers, representing syntactic
+            head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic
+            relation types.
+        entities (iterable): A sequence of named entity annotations, either as
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
+            representing the entity positions.
+        sent_starts (iterable): A sequence of sentence position tags, 1 for
+            the first word in a sentence, 0 for all others.
+        make_projective (bool): If True (and both heads and deps contain at
+            least one truthy value), the heads and deps are passed through
+            `nonproj.projectivize` before alignment.
+        cats (dict): Labels for text classification. Each key in the dictionary
+            may be a string or an int, or a `(start_char, end_char, label)`
+            tuple, indicating that the label is applied to only part of the
+            document (usually a sentence). Unlike entity annotations, label
+            annotations can overlap, i.e. a single word can be covered by
+            multiple labelled spans. The TextCategorizer component expects
+            true examples of a label to have the value 1.0, and negative
+            examples of a label to have the value 0.0. Labels not in the
+            dictionary are treated as missing - the gradient for those labels
+            will be zero.
+        links (dict): A dict with `(start_char, end_char)` keys,
+            and the values being dicts with kb_id:value entries,
+            representing the external IDs in a knowledge base (KB)
+            mapped to either 1.0 or 0.0, indicating positive and
+            negative examples respectively.
+        RAISES (ValueError): If the aligned heads contain a cycle (E069).
+        RETURNS (GoldParse): The newly constructed object.
+        """
+        self.mem = Pool()
+        self.loss = 0
+        self.length = len(doc)
+
+        # Copy, so later changes to the caller's dicts don't leak in.
+        self.cats = {} if cats is None else dict(cats)
+        self.links = {} if links is None else dict(links)
+
+        # temporary doc for aligning entity annotation
+        entdoc = None
+
+        # NOTE(review): TokenAnnotation and biluo_tags_from_offsets are used
+        # below but are not imported in this module's visible import header --
+        # confirm they are brought into scope.
+
+        # avoid allocating memory if the doc does not contain any tokens
+        if self.length == 0:
+            # NOTE(review): pos, lemmas, sent_starts, cand_to_gold and
+            # gold_to_cand are not assigned on this path.
+            self.words = []
+            self.tags = []
+            self.heads = []
+            self.labels = []
+            self.ner = []
+            self.morphs = []
+            # set a minimal orig so that the scorer can score an empty doc
+            self.orig = TokenAnnotation(ids=[])
+        else:
+            # Fill in defaults for any annotation layer that was not provided.
+            if not words:
+                words = [token.text for token in doc]
+            if not tags:
+                tags = [None for _ in words]
+            if not pos:
+                pos = [None for _ in words]
+            if not morphs:
+                morphs = [None for _ in words]
+            if not lemmas:
+                lemmas = [None for _ in words]
+            if not heads:
+                heads = [None for _ in words]
+            if not deps:
+                deps = [None for _ in words]
+            if not sent_starts:
+                sent_starts = [None for _ in words]
+            if entities is None:
+                # No NER annotation at all: every token is "missing".
+                entities = ["-" for _ in words]
+            elif len(entities) == 0:
+                # Explicitly empty annotation: every token is outside an entity.
+                entities = ["O" for _ in words]
+            else:
+                # Translate the None values to '-', to make processing easier.
+                # See Issue #2603
+                entities = [(ent if ent is not None else "-") for ent in entities]
+                if not isinstance(entities[0], str):
+                    # Assume we have entities specified by character offset.
+                    # Create a temporary Doc corresponding to provided words
+                    # (to preserve gold tokenization) and text (to preserve
+                    # character offsets).
+                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
+                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
+                    entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
+                    # There may be some additional whitespace tokens in the
+                    # temporary doc, so check that the annotations align with
+                    # the provided words while building a list of BILUO labels.
+                    entities = []
+                    words_offset = 0
+                    for i, entdoc_word in enumerate(entdoc_words):
+                        if words[i + words_offset] == entdoc_word:
+                            entities.append(entdoc_entities[i])
+                        else:
+                            # Extra token in the temporary doc: skip it and
+                            # keep pointing at the same gold word.
+                            words_offset -= 1
+                    if len(entities) != len(words):
+                        warnings.warn(Warnings.W029.format(text=doc.text))
+                        entities = ["-" for _ in words]
+
+            # These are filled by the tagger/parser/entity recogniser
+            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
+            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
+            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
+
+            self.words = [None] * len(doc)
+            self.tags = [None] * len(doc)
+            self.pos = [None] * len(doc)
+            self.morphs = [None] * len(doc)
+            self.lemmas = [None] * len(doc)
+            self.heads = [None] * len(doc)
+            self.labels = [None] * len(doc)
+            self.ner = [None] * len(doc)
+            self.sent_starts = [None] * len(doc)
+
+            # This needs to be done before we align the words
+            if make_projective and any(heads) and any(deps):
+                heads, deps = nonproj.projectivize(heads, deps)
+
+            # Do many-to-one alignment for misaligned tokens.
+            # If we over-segment, we'll have one gold word that covers a sequence
+            # of predicted words
+            # If we under-segment, we'll have one predicted word that covers a
+            # sequence of gold words.
+            # If we "mis-segment", we'll have a sequence of predicted words covering
+            # a sequence of gold words. That's many-to-many -- we don't do that
+            # except for NER spans where the start and end can be aligned.
+            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
+
+            # Negative alignment indices mean "no one-to-one alignment".
+            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
+            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
+
+            self.orig = TokenAnnotation(ids=list(range(len(words))),
+                    words=words, tags=tags, pos=pos, morphs=morphs,
+                    lemmas=lemmas, heads=heads, deps=deps, entities=entities,
+                    sent_starts=sent_starts, brackets=[])
+
+            for i, gold_i in enumerate(self.cand_to_gold):
+                if doc[i].text.isspace():
+                    self.words[i] = doc[i].text
+                    self.tags[i] = "_SP"
+                    self.pos[i] = "SPACE"
+                    self.morphs[i] = None
+                    self.lemmas[i] = None
+                    self.heads[i] = None
+                    self.labels[i] = None
+                    self.ner[i] = None
+                    self.sent_starts[i] = 0
+                if gold_i is None:
+                    if i in i2j_multi:
+                        self.words[i] = words[i2j_multi[i]]
+                        self.tags[i] = tags[i2j_multi[i]]
+                        self.pos[i] = pos[i2j_multi[i]]
+                        self.morphs[i] = morphs[i2j_multi[i]]
+                        self.lemmas[i] = lemmas[i2j_multi[i]]
+                        self.sent_starts[i] = sent_starts[i2j_multi[i]]
+                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
+                        # Set next word in multi-token span as head, until last
+                        if not is_last:
+                            self.heads[i] = i+1
+                            self.labels[i] = "subtok"
+                        else:
+                            head_i = heads[i2j_multi[i]]
+                            # Compare against None explicitly: a head index
+                            # of 0 is valid and must not be treated as missing
+                            # (same check as the one-to-one branch below).
+                            if head_i is not None:
+                                self.heads[i] = self.gold_to_cand[head_i]
+                            self.labels[i] = deps[i2j_multi[i]]
+                        ner_tag = entities[i2j_multi[i]]
+                        # Assign O/- for many-to-one O/- NER tags
+                        if ner_tag in ("O", "-"):
+                            self.ner[i] = ner_tag
+                else:
+                    self.words[i] = words[gold_i]
+                    self.tags[i] = tags[gold_i]
+                    self.pos[i] = pos[gold_i]
+                    self.morphs[i] = morphs[gold_i]
+                    self.lemmas[i] = lemmas[gold_i]
+                    self.sent_starts[i] = sent_starts[gold_i]
+                    if heads[gold_i] is None:
+                        self.heads[i] = None
+                    else:
+                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
+                    self.labels[i] = deps[gold_i]
+                    self.ner[i] = entities[gold_i]
+            # Assign O/- for one-to-many O/- NER tags
+            for j, cand_j in enumerate(self.gold_to_cand):
+                if cand_j is None:
+                    if j in j2i_multi:
+                        i = j2i_multi[j]
+                        ner_tag = entities[j]
+                        if ner_tag in ("O", "-"):
+                            self.ner[i] = ner_tag
+
+            # If there is entity annotation and some tokens remain unaligned,
+            # align all entities at the character level to account for all
+            # possible token misalignments within the entity spans
+            if any(e not in ("O", "-") for e in entities) and None in self.ner:
+                # If the temporary entdoc wasn't created above, initialize it
+                if not entdoc:
+                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
+                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
+                # Get offsets based on gold words and BILUO entities
+                entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
+                aligned_offsets = []
+                aligned_spans = []
+                # Filter offsets to identify those that align with doc tokens
+                for offset in entdoc_offsets:
+                    span = doc.char_span(offset[0], offset[1])
+                    if span and not span.text.isspace():
+                        aligned_offsets.append(offset)
+                        aligned_spans.append(span)
+                # Convert back to BILUO for doc tokens and assign NER for all
+                # aligned spans
+                biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
+                for span in aligned_spans:
+                    for i in range(span.start, span.end):
+                        self.ner[i] = biluo_tags[i]
+
+            # Prevent whitespace that isn't within entities from being tagged as
+            # an entity.
+            for i in range(len(self.ner)):
+                if self.tags[i] == "_SP":
+                    prev_ner = self.ner[i-1] if i >= 1 else None
+                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
+                    if prev_ner == "O" or next_ner == "O":
+                        self.ner[i] = "O"
+
+            cycle = nonproj.contains_cycle(self.heads)
+            if cycle is not None:
+                raise ValueError(Errors.E069.format(cycle=cycle,
+                    cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
+                    doc_tokens=" ".join(words[:50])))
+
+    def __len__(self):
+        """Get the number of gold-standard tokens.
+
+        RETURNS (int): The number of gold-standard tokens.
+        """
+        return self.length
+
+    @property
+    def is_projective(self):
+        """Whether the provided syntactic annotations form a projective
+        dependency tree.
+        """
+        return not nonproj.is_nonproj_tree(self.heads)
--- a/spacy/syntax/ner.pxd
+++ b/spacy/syntax/ner.pxd
@ -1,6 +1,6 @@
 from .transition_system cimport TransitionSystem
 from .transition_system cimport Transition
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParseC
 from ..typedefs cimport attr_t


--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -7,7 +7,7 @@ from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
-from ..gold cimport GoldParseC, GoldParse
+from .gold_parse cimport GoldParseC, GoldParse
 from ..lexeme cimport Lexeme
 from ..attrs cimport IS_SPACE

--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -20,7 +20,7 @@ import numpy
 import warnings

 from ..tokens.doc cimport Doc
-from ..gold cimport GoldParse
+from .gold_parse cimport GoldParse
 from ..typedefs cimport weight_t, class_t, hash_t
 from ._parser_model cimport alloc_activations, free_activations
 from ._parser_model cimport predict_states, arg_max_if_valid
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -2,8 +2,8 @@ from cymem.cymem cimport Pool

 from ..typedefs cimport attr_t, weight_t
 from ..structs cimport TokenC
-from ..gold cimport GoldParse
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParse
+from .gold_parse cimport GoldParseC
 from ..strings cimport StringStore
 from .stateclass cimport StateClass
 from ._state cimport StateC