# spaCy/spacy/syntax/gold_parse.pyx
# cython: profile=True
import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly
import warnings
from .. import util
from . import nonproj
from ..tokens import Doc, Span
from ..errors import Errors, AlignmentError, Warnings
from ..gold.annotation import TokenAnnotation
from ..gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets
from ..gold.align import align
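
# NOTE (assumption): `Pool` (from cymem), `attr_t` and `Transition`, which are
# used in the C-level allocations below, are cimported via the matching
# gold_parse.pxd, which also declares the C fields behind `self.c`.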

punct_re = re.compile(r"\W")


def is_punct_label(label):
    return label == "P" or label.lower() == "punct"


def get_parses_from_example(
example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
):
"""Return a list of (doc, GoldParse) objects.
If merge is set to True, keep all Token annotations as one big list."""
    # merge=True: keep the Example intact and produce a single GoldParse
    # covering the whole doc
if merge:
examples = [example]
else:
# not merging: one GoldParse per sentence, defining docs with the words
# from each sentence
examples = example.split_sents()
outputs = []
for eg in examples:
eg_dict = eg.to_dict()
try:
gp = GoldParse.from_annotation(
eg.predicted,
eg_dict["doc_annotation"],
eg_dict["token_annotation"],
make_projective=make_projective
)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
outputs.append((eg.predicted, gp))
return outputs
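

# Hedged usage sketch (illustrative; assumes `example` is an Example object as
# built by the training code at this revision):
#
#     parses = get_parses_from_example(example, merge=False,
#                                      make_projective=True,
#                                      ignore_misaligned=True)
#     for sent_doc, gold in parses:
#         if gold is None:  # this sentence was misaligned and skipped
#             continue
#         print(gold.words, gold.ner)
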
cdef class GoldParse:
"""Collection for training annotations.
DOCS: https://spacy.io/api/goldparse
"""
@classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
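        # Shape sketch (illustrative): `token_annotation` carries parallel,
        # token-aligned lists and `doc_annotation` document-level labels, e.g.
        #     token_annotation = {"words": [...], "tags": [...], "pos": [...],
        #                         "morphs": [...], "lemmas": [...],
        #                         "heads": [...], "deps": [...],
        #                         "entities": [...], "sent_starts": [...]}
        #     doc_annotation = {"cats": {...}, "links": {...}}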
return cls(
doc,
words=token_annotation["words"],
tags=token_annotation["tags"],
pos=token_annotation["pos"],
morphs=token_annotation["morphs"],
lemmas=token_annotation["lemmas"],
heads=token_annotation["heads"],
deps=token_annotation["deps"],
entities=token_annotation["entities"],
sent_starts=token_annotation["sent_starts"],
cats=doc_annotation["cats"],
links=doc_annotation["links"],
make_projective=make_projective
)

    def get_token_annotation(self):
ids = None
if self.words:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
pos=self.pos, morphs=self.morphs,
lemmas=self.lemmas, heads=self.heads,
deps=self.labels, entities=self.ner,
sent_starts=self.sent_starts)

    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
lemmas=None, heads=None, deps=None, entities=None,
sent_starts=None, make_projective=False, cats=None,
links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
pos (iterable): A sequence of strings, representing UPOS annotations.
morphs (iterable): A sequence of strings, representing morph
annotations.
lemmas (iterable): A sequence of strings, representing lemma
annotations.
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
sent_starts (iterable): A sequence of sentence position tags, 1 for
the first word in a sentence, 0 for all others.
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing; the gradient for those labels
            will be zero.
links (dict): A dict with `(start_char, end_char)` keys,
and the values being dicts with kb_id:value entries,
representing the external IDs in a knowledge base (KB)
mapped to either 1.0 or 0.0, indicating positive and
negative examples respectively.

        RETURNS (GoldParse): The newly constructed object.
"""
self.mem = Pool()
self.loss = 0
self.length = len(doc)
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
# temporary doc for aligning entity annotation
entdoc = None
# avoid allocating memory if the doc does not contain any tokens
if self.length == 0:
self.words = []
self.tags = []
self.heads = []
self.labels = []
self.ner = []
self.morphs = []
# set a minimal orig so that the scorer can score an empty doc
self.orig = TokenAnnotation(ids=[])
else:
if not words:
words = [token.text for token in doc]
if not tags:
tags = [None for _ in words]
if not pos:
pos = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
if not lemmas:
lemmas = [None for _ in words]
if not heads:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not sent_starts:
sent_starts = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
entities = ["O" for _ in words]
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], str):
# Assume we have entities specified by character offset.
# Create a temporary Doc corresponding to provided words
# (to preserve gold tokenization) and text (to preserve
# character offsets).
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
# There may be some additional whitespace tokens in the
# temporary doc, so check that the annotations align with
# the provided words while building a list of BILUO labels.
entities = []
words_offset = 0
for i in range(len(entdoc_words)):
if words[i + words_offset] == entdoc_words[i]:
entities.append(entdoc_entities[i])
else:
words_offset -= 1
if len(entities) != len(words):
warnings.warn(Warnings.W029.format(text=doc.text))
entities = ["-" for _ in words]
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.pos = [None] * len(doc)
self.morphs = [None] * len(doc)
self.lemmas = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
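            # Illustrative outcome: if the doc tokenizes "don't" as
            # ["do", "n't"] while the gold words keep a single token "don't",
            # i2j marks both predicted tokens as unaligned (-1) and
            # i2j_multi maps each of them to the one gold index
            # (many-to-one); j2i_multi covers the reverse case.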
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))),
words=words, tags=tags, pos=pos, morphs=morphs,
lemmas=lemmas, heads=heads, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
self.pos[i] = "SPACE"
self.morphs[i] = None
self.lemmas[i] = None
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.pos[i] = pos[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
                            head_i = heads[i2j_multi[i]]
                            # Only skip missing heads: index 0 is a valid head,
                            # so check for None explicitly rather than relying
                            # on truthiness (which would also skip head 0).
                            if head_i is not None:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
ner_tag = entities[i2j_multi[i]]
# Assign O/- for many-to-one O/- NER tags
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.pos[i] = pos[gold_i]
self.morphs[i] = morphs[gold_i]
self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Assign O/- for one-to-many O/- NER tags
for j, cand_j in enumerate(self.gold_to_cand):
if cand_j is None:
if j in j2i_multi:
i = j2i_multi[j]
ner_tag = entities[j]
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
# If there is entity annotation and some tokens remain unaligned,
# align all entities at the character level to account for all
# possible token misalignments within the entity spans
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
# If the temporary entdoc wasn't created above, initialize it
if not entdoc:
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
# Get offsets based on gold words and BILUO entities
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
aligned_offsets = []
aligned_spans = []
# Filter offsets to identify those that align with doc tokens
for offset in entdoc_offsets:
span = doc.char_span(offset[0], offset[1])
if span and not span.text.isspace():
aligned_offsets.append(offset)
aligned_spans.append(span)
# Convert back to BILUO for doc tokens and assign NER for all
# aligned spans
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
for span in aligned_spans:
for i in range(span.start, span.end):
self.ner[i] = biluo_tags[i]
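            # Illustrative case: the gold words keep "New York" as one token
            # tagged U-GPE while `doc` has ["New", "York"]. The many-to-one
            # branch above leaves those positions as None (the tag is not
            # O/-), so this block recovers the character span from the gold
            # tokens and re-encodes it against `doc`, yielding B-GPE/L-GPE.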
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

RETURNS (int): The number of gold-standard tokens.
"""
return self.length

    @property
def is_projective(self):
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)