diff --git a/spacy/syntax/arc_eager.pxd b/spacy/syntax/arc_eager.pxd index 14d706548..96dd37a36 100644 --- a/spacy/syntax/arc_eager.pxd +++ b/spacy/syntax/arc_eager.pxd @@ -3,7 +3,7 @@ from cymem.cymem cimport Pool from .stateclass cimport StateClass from ..typedefs cimport weight_t, attr_t from .transition_system cimport TransitionSystem, Transition -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParseC cdef class ArcEager(TransitionSystem): diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 19be95f3f..df8c7d563 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -8,7 +8,7 @@ import json from ..typedefs cimport hash_t, attr_t from ..strings cimport hash_string -from ..gold cimport GoldParse, GoldParseC +from .gold_parse cimport GoldParse, GoldParseC from ..structs cimport TokenC from ..tokens.doc cimport Doc, set_children_from_heads from .stateclass cimport StateClass diff --git a/spacy/syntax/gold_parse.pxd b/spacy/syntax/gold_parse.pxd new file mode 100644 index 000000000..9815513d0 --- /dev/null +++ b/spacy/syntax/gold_parse.pxd @@ -0,0 +1,39 @@ +from cymem.cymem cimport Pool +from .transition_system cimport Transition +from ..typedefs cimport attr_t + + +cdef struct GoldParseC: + int* tags + int* heads + int* has_dep + int* sent_start + attr_t* labels + int** brackets + Transition* ner + + +cdef class GoldParse: + cdef Pool mem + + cdef GoldParseC c + cdef readonly object orig + + cdef int length + cdef public int loss + cdef public list words + cdef public list tags + cdef public list pos + cdef public list morphs + cdef public list lemmas + cdef public list sent_starts + cdef public list heads + cdef public list labels + cdef public dict orths + cdef public list ner + cdef public dict brackets + cdef public dict cats + cdef public dict links + + cdef readonly list cand_to_gold + cdef readonly list gold_to_cand diff --git a/spacy/syntax/gold_parse.pyx b/spacy/syntax/gold_parse.pyx new file mode 
100644 index 000000000..59e8f4bbb --- /dev/null +++ b/spacy/syntax/gold_parse.pyx @@ -0,0 +1,311 @@ +# cython: profile=True +import re +import random +import numpy +import tempfile +import shutil +import itertools +from pathlib import Path +import srsly +import warnings + +from .. import util +from ..syntax import nonproj +from ..tokens import Doc, Span +from ..errors import Errors, AlignmentError, Warnings +from .iob_utils import offsets_from_biluo_tags +from .align import align + + +punct_re = re.compile(r"\W") + +def is_punct_label(label): + return label == "P" or label.lower() == "punct" + + +cdef class GoldParse: + """Collection for training annotations. + + DOCS: https://spacy.io/api/goldparse + """ + @classmethod + def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False): + return cls(doc, words=token_annotation.words, + tags=token_annotation.tags, + pos=token_annotation.pos, + morphs=token_annotation.morphs, + lemmas=token_annotation.lemmas, + heads=token_annotation.heads, + deps=token_annotation.deps, + entities=token_annotation.entities, + sent_starts=token_annotation.sent_starts, + cats=doc_annotation.cats, + links=doc_annotation.links, + make_projective=make_projective) + + def get_token_annotation(self): + ids = None + if self.words: + ids = list(range(len(self.words))) + + return TokenAnnotation(ids=ids, words=self.words, tags=self.tags, + pos=self.pos, morphs=self.morphs, + lemmas=self.lemmas, heads=self.heads, + deps=self.labels, entities=self.ner, + sent_starts=self.sent_starts) + + def __init__(self, doc, words=None, tags=None, pos=None, morphs=None, + lemmas=None, heads=None, deps=None, entities=None, + sent_starts=None, make_projective=False, cats=None, + links=None): + """Create a GoldParse. The fields will not be initialized if len(doc) is zero. + + doc (Doc): The document the annotations refer to. + words (iterable): A sequence of unicode word strings. 
+ tags (iterable): A sequence of strings, representing tag annotations. + pos (iterable): A sequence of strings, representing UPOS annotations. + morphs (iterable): A sequence of strings, representing morph + annotations. + lemmas (iterable): A sequence of strings, representing lemma + annotations. + heads (iterable): A sequence of integers, representing syntactic + head offsets. + deps (iterable): A sequence of strings, representing the syntactic + relation types. + entities (iterable): A sequence of named entity annotations, either as + BILUO tag strings, or as `(start_char, end_char, label)` tuples, + representing the entity positions. + sent_starts (iterable): A sequence of sentence position tags, 1 for + the first word in a sentence, 0 for all others. + cats (dict): Labels for text classification. Each key in the dictionary + may be a string or an int, or a `(start_char, end_char, label)` + tuple, indicating that the label is applied to only part of the + document (usually a sentence). Unlike entity annotations, label + annotations can overlap, i.e. a single word can be covered by + multiple labelled spans. The TextCategorizer component expects + true examples of a label to have the value 1.0, and negative + examples of a label to have the value 0.0. Labels not in the + dictionary are treated as missing - the gradient for those labels + will be zero. + links (dict): A dict with `(start_char, end_char)` keys, + and the values being dicts with kb_id:value entries, + representing the external IDs in a knowledge base (KB) + mapped to either 1.0 or 0.0, indicating positive and + negative examples respectively. + RETURNS (GoldParse): The newly constructed object. 
+ """ + self.mem = Pool() + self.loss = 0 + self.length = len(doc) + + self.cats = {} if cats is None else dict(cats) + self.links = {} if links is None else dict(links) + + # temporary doc for aligning entity annotation + entdoc = None + + # avoid allocating memory if the doc does not contain any tokens + if self.length == 0: + self.words = [] + self.tags = [] + self.heads = [] + self.labels = [] + self.ner = [] + self.morphs = [] + # set a minimal orig so that the scorer can score an empty doc + self.orig = TokenAnnotation(ids=[]) + else: + if not words: + words = [token.text for token in doc] + if not tags: + tags = [None for _ in words] + if not pos: + pos = [None for _ in words] + if not morphs: + morphs = [None for _ in words] + if not lemmas: + lemmas = [None for _ in words] + if not heads: + heads = [None for _ in words] + if not deps: + deps = [None for _ in words] + if not sent_starts: + sent_starts = [None for _ in words] + if entities is None: + entities = ["-" for _ in words] + elif len(entities) == 0: + entities = ["O" for _ in words] + else: + # Translate the None values to '-', to make processing easier. + # See Issue #2603 + entities = [(ent if ent is not None else "-") for ent in entities] + if not isinstance(entities[0], str): + # Assume we have entities specified by character offset. + # Create a temporary Doc corresponding to provided words + # (to preserve gold tokenization) and text (to preserve + # character offsets). + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + entdoc_entities = biluo_tags_from_offsets(entdoc, entities) + # There may be some additional whitespace tokens in the + # temporary doc, so check that the annotations align with + # the provided words while building a list of BILUO labels. 
+ entities = [] + words_offset = 0 + for i in range(len(entdoc_words)): + if words[i + words_offset] == entdoc_words[i]: + entities.append(entdoc_entities[i]) + else: + words_offset -= 1 + if len(entities) != len(words): + warnings.warn(Warnings.W029.format(text=doc.text)) + entities = ["-" for _ in words] + + # These are filled by the tagger/parser/entity recogniser + self.c.tags = self.mem.alloc(len(doc), sizeof(int)) + self.c.heads = self.mem.alloc(len(doc), sizeof(int)) + self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t)) + self.c.has_dep = self.mem.alloc(len(doc), sizeof(int)) + self.c.sent_start = self.mem.alloc(len(doc), sizeof(int)) + self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) + + self.words = [None] * len(doc) + self.tags = [None] * len(doc) + self.pos = [None] * len(doc) + self.morphs = [None] * len(doc) + self.lemmas = [None] * len(doc) + self.heads = [None] * len(doc) + self.labels = [None] * len(doc) + self.ner = [None] * len(doc) + self.sent_starts = [None] * len(doc) + + # This needs to be done before we align the words + if make_projective and any(heads) and any(deps) : + heads, deps = nonproj.projectivize(heads, deps) + + # Do many-to-one alignment for misaligned tokens. + # If we over-segment, we'll have one gold word that covers a sequence + # of predicted words + # If we under-segment, we'll have one predicted word that covers a + # sequence of gold words. + # If we "mis-segment", we'll have a sequence of predicted words covering + # a sequence of gold words. That's many-to-many -- we don't do that + # except for NER spans where the start and end can be aligned. 
+ cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words) + + self.cand_to_gold = [(j if j >= 0 else None) for j in i2j] + self.gold_to_cand = [(i if i >= 0 else None) for i in j2i] + + self.orig = TokenAnnotation(ids=list(range(len(words))), + words=words, tags=tags, pos=pos, morphs=morphs, + lemmas=lemmas, heads=heads, deps=deps, entities=entities, + sent_starts=sent_starts, brackets=[]) + + for i, gold_i in enumerate(self.cand_to_gold): + if doc[i].text.isspace(): + self.words[i] = doc[i].text + self.tags[i] = "_SP" + self.pos[i] = "SPACE" + self.morphs[i] = None + self.lemmas[i] = None + self.heads[i] = None + self.labels[i] = None + self.ner[i] = None + self.sent_starts[i] = 0 + if gold_i is None: + if i in i2j_multi: + self.words[i] = words[i2j_multi[i]] + self.tags[i] = tags[i2j_multi[i]] + self.pos[i] = pos[i2j_multi[i]] + self.morphs[i] = morphs[i2j_multi[i]] + self.lemmas[i] = lemmas[i2j_multi[i]] + self.sent_starts[i] = sent_starts[i2j_multi[i]] + is_last = i2j_multi[i] != i2j_multi.get(i+1) + # Set next word in multi-token span as head, until last + if not is_last: + self.heads[i] = i+1 + self.labels[i] = "subtok" + else: + head_i = heads[i2j_multi[i]] + if head_i: + self.heads[i] = self.gold_to_cand[head_i] + self.labels[i] = deps[i2j_multi[i]] + ner_tag = entities[i2j_multi[i]] + # Assign O/- for many-to-one O/- NER tags + if ner_tag in ("O", "-"): + self.ner[i] = ner_tag + else: + self.words[i] = words[gold_i] + self.tags[i] = tags[gold_i] + self.pos[i] = pos[gold_i] + self.morphs[i] = morphs[gold_i] + self.lemmas[i] = lemmas[gold_i] + self.sent_starts[i] = sent_starts[gold_i] + if heads[gold_i] is None: + self.heads[i] = None + else: + self.heads[i] = self.gold_to_cand[heads[gold_i]] + self.labels[i] = deps[gold_i] + self.ner[i] = entities[gold_i] + # Assign O/- for one-to-many O/- NER tags + for j, cand_j in enumerate(self.gold_to_cand): + if cand_j is None: + if j in j2i_multi: + i = j2i_multi[j] + ner_tag = entities[j] + if 
ner_tag in ("O", "-"): + self.ner[i] = ner_tag + + # If there is entity annotation and some tokens remain unaligned, + # align all entities at the character level to account for all + # possible token misalignments within the entity spans + if any([e not in ("O", "-") for e in entities]) and None in self.ner: + # If the temporary entdoc wasn't created above, initialize it + if not entdoc: + entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text) + entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces) + # Get offsets based on gold words and BILUO entities + entdoc_offsets = offsets_from_biluo_tags(entdoc, entities) + aligned_offsets = [] + aligned_spans = [] + # Filter offsets to identify those that align with doc tokens + for offset in entdoc_offsets: + span = doc.char_span(offset[0], offset[1]) + if span and not span.text.isspace(): + aligned_offsets.append(offset) + aligned_spans.append(span) + # Convert back to BILUO for doc tokens and assign NER for all + # aligned spans + biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None) + for span in aligned_spans: + for i in range(span.start, span.end): + self.ner[i] = biluo_tags[i] + + # Prevent whitespace that isn't within entities from being tagged as + # an entity. + for i in range(len(self.ner)): + if self.tags[i] == "_SP": + prev_ner = self.ner[i-1] if i >= 1 else None + next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None + if prev_ner == "O" or next_ner == "O": + self.ner[i] = "O" + + cycle = nonproj.contains_cycle(self.heads) + if cycle is not None: + raise ValueError(Errors.E069.format(cycle=cycle, + cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), + doc_tokens=" ".join(words[:50]))) + + def __len__(self): + """Get the number of gold-standard tokens. + + RETURNS (int): The number of gold-standard tokens. 
+ """ + return self.length + + @property + def is_projective(self): + """Whether the provided syntactic annotations form a projective + dependency tree. + """ + return not nonproj.is_nonproj_tree(self.heads) diff --git a/spacy/syntax/ner.pxd b/spacy/syntax/ner.pxd index 647f98fc0..739b8dc1f 100644 --- a/spacy/syntax/ner.pxd +++ b/spacy/syntax/ner.pxd @@ -1,6 +1,6 @@ from .transition_system cimport TransitionSystem from .transition_system cimport Transition -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParseC from ..typedefs cimport attr_t diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index ff74be601..4061304d8 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -7,7 +7,7 @@ from .stateclass cimport StateClass from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t -from ..gold cimport GoldParseC, GoldParse +from .gold_parse cimport GoldParseC, GoldParse from ..lexeme cimport Lexeme from ..attrs cimport IS_SPACE diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index fcaff444e..12f56ba67 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -20,7 +20,7 @@ import numpy import warnings from ..tokens.doc cimport Doc -from ..gold cimport GoldParse +from .gold_parse cimport GoldParse from ..typedefs cimport weight_t, class_t, hash_t from ._parser_model cimport alloc_activations, free_activations from ._parser_model cimport predict_states, arg_max_if_valid diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 5fd3b5c5f..33f96c331 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -2,8 +2,8 @@ from cymem.cymem cimport Pool from ..typedefs cimport attr_t, weight_t from ..structs cimport TokenC -from ..gold cimport GoldParse -from ..gold cimport GoldParseC +from .gold_parse cimport GoldParse +from .gold_parse cimport GoldParseC from ..strings cimport StringStore 
 from .stateclass cimport StateClass
 from ._state cimport StateC