# cython: profile=True
import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly
import warnings
from .. import util
from ..syntax import nonproj
from ..tokens import Doc, Span
from ..errors import Errors, AlignmentError, Warnings
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
from .align import align
# TokenAnnotation is used below but its home module isn't shown in this
# file; this import path is an assumption.
from .annotation import TokenAnnotation

punct_re = re.compile(r"\W")


def is_punct_label(label):
    return label == "P" or label.lower() == "punct"


cdef class GoldParse:
    """Collection for training annotations.

    DOCS: https://spacy.io/api/goldparse
    """

    @classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
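        """Construct a GoldParse from a doc-level annotation (cats, links)
        and a token-level annotation (words, tags, heads, deps, etc.).
        """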
return cls(doc, words=token_annotation.words,
tags=token_annotation.tags,
pos=token_annotation.pos,
morphs=token_annotation.morphs,
lemmas=token_annotation.lemmas,
heads=token_annotation.heads,
deps=token_annotation.deps,
entities=token_annotation.entities,
sent_starts=token_annotation.sent_starts,
cats=doc_annotation.cats,
links=doc_annotation.links,
make_projective=make_projective)

    def get_token_annotation(self):
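        """Return a TokenAnnotation that mirrors this GoldParse's aligned
        annotations, with ids generated from the word positions.
        """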
ids = None
if self.words:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
pos=self.pos, morphs=self.morphs,
lemmas=self.lemmas, heads=self.heads,
deps=self.labels, entities=self.ner,
sent_starts=self.sent_starts)

    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
lemmas=None, heads=None, deps=None, entities=None,
sent_starts=None, make_projective=False, cats=None,
links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
pos (iterable): A sequence of strings, representing UPOS annotations.
morphs (iterable): A sequence of strings, representing morph
annotations.
lemmas (iterable): A sequence of strings, representing lemma
annotations.
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
sent_starts (iterable): A sequence of sentence position tags, 1 for
the first word in a sentence, 0 for all others.
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative
examples of a label to have the value 0.0. Labels not in the
dictionary are treated as missing - the gradient for those labels
will be zero.
links (dict): A dict with `(start_char, end_char)` keys,
and the values being dicts with kb_id:value entries,
representing the external IDs in a knowledge base (KB)
mapped to either 1.0 or 0.0, indicating positive and
negative examples respectively.

        RETURNS (GoldParse): The newly constructed object.
"""
self.mem = Pool()
self.loss = 0
self.length = len(doc)
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
# temporary doc for aligning entity annotation
entdoc = None
# avoid allocating memory if the doc does not contain any tokens
if self.length == 0:
self.words = []
self.tags = []
self.heads = []
self.labels = []
self.ner = []
self.morphs = []
# set a minimal orig so that the scorer can score an empty doc
self.orig = TokenAnnotation(ids=[])
else:
if not words:
words = [token.text for token in doc]
if not tags:
tags = [None for _ in words]
if not pos:
pos = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
if not lemmas:
lemmas = [None for _ in words]
if not heads:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not sent_starts:
sent_starts = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
entities = ["O" for _ in words]
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], str):
# Assume we have entities specified by character offset.
# Create a temporary Doc corresponding to provided words
# (to preserve gold tokenization) and text (to preserve
# character offsets).
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
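                    # For example, with gold words ["London", "calling"] and
                    # doc.text "London calling", the offset entity
                    # (0, 6, "GPE") becomes the tag sequence ["U-GPE", "O"].
                    # (Hypothetical values, for illustration.)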
# There may be some additional whitespace tokens in the
# temporary doc, so check that the annotations align with
# the provided words while building a list of BILUO labels.
entities = []
words_offset = 0
for i in range(len(entdoc_words)):
if words[i + words_offset] == entdoc_words[i]:
entities.append(entdoc_entities[i])
else:
words_offset -= 1
if len(entities) != len(words):
warnings.warn(Warnings.W029.format(text=doc.text))
entities = ["-" for _ in words]
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.pos = [None] * len(doc)
self.morphs = [None] * len(doc)
self.lemmas = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
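            # For example, if the tokenizer produced ["New", "York"] where
            # the gold tokenization has the single word "New York", i2j holds
            # -1 for both candidate tokens (so cand_to_gold is None) and
            # i2j_multi maps each of them to gold index 0.
            # (Hypothetical tokens, for illustration.)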
self.orig = TokenAnnotation(ids=list(range(len(words))),
words=words, tags=tags, pos=pos, morphs=morphs,
lemmas=lemmas, heads=heads, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
self.pos[i] = "SPACE"
self.morphs[i] = None
self.lemmas[i] = None
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.pos[i] = pos[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
head_i = heads[i2j_multi[i]]
if head_i:
self.heads[i] = self.gold_to_cand[head_i]
self.labels[i] = deps[i2j_multi[i]]
ner_tag = entities[i2j_multi[i]]
# Assign O/- for many-to-one O/- NER tags
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
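                        # e.g. if gold "New York" was split into ["New",
                        # "York"], "New" is attached to "York" with the label
                        # "subtok", while "York" inherits the gold head and
                        # dep label. (Hypothetical tokens, for illustration.)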
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.pos[i] = pos[gold_i]
self.morphs[i] = morphs[gold_i]
self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Assign O/- for one-to-many O/- NER tags
for j, cand_j in enumerate(self.gold_to_cand):
if cand_j is None:
if j in j2i_multi:
i = j2i_multi[j]
ner_tag = entities[j]
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
# If there is entity annotation and some tokens remain unaligned,
# align all entities at the character level to account for all
# possible token misalignments within the entity spans
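            # For example, an entity whose internal tokenization differs can
            # still be recovered as long as its start and end characters line
            # up with candidate token boundaries (doc.char_span returns None
            # otherwise).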
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
# If the temporary entdoc wasn't created above, initialize it
if not entdoc:
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
# Get offsets based on gold words and BILUO entities
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
aligned_offsets = []
aligned_spans = []
# Filter offsets to identify those that align with doc tokens
for offset in entdoc_offsets:
span = doc.char_span(offset[0], offset[1])
if span and not span.text.isspace():
aligned_offsets.append(offset)
aligned_spans.append(span)
# Convert back to BILUO for doc tokens and assign NER for all
# aligned spans
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
for span in aligned_spans:
for i in range(span.start, span.end):
self.ner[i] = biluo_tags[i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
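            # For example, a lone space token sitting between two "O"
            # neighbours is forced to "O" instead of keeping a None tag.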
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
return self.length

    @property
def is_projective(self):
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
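

# A minimal usage sketch (hypothetical sentence and annotations, for
# illustration only):
#
#     import spacy
#     nlp = spacy.blank("en")
#     doc = nlp.make_doc("She ate the pizza")
#     gold = GoldParse(doc, heads=[1, 1, 3, 1],
#                      deps=["nsubj", "ROOT", "det", "dobj"])
#     assert len(gold) == 4
#     assert gold.is_projective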