mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 17:10:36 +03:00
Move GoldParse under spacy.syntax
This commit is contained in:
parent
32c8fb1372
commit
7b873ce2b1
|
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from .stateclass cimport StateClass
|
||||
from ..typedefs cimport weight_t, attr_t
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..gold cimport GoldParseC
|
||||
from .gold_parse cimport GoldParseC
|
||||
|
||||
|
||||
cdef class ArcEager(TransitionSystem):
|
||||
|
|
|
@ -8,7 +8,7 @@ import json
|
|||
|
||||
from ..typedefs cimport hash_t, attr_t
|
||||
from ..strings cimport hash_string
|
||||
from ..gold cimport GoldParse, GoldParseC
|
||||
from .gold_parse cimport GoldParse, GoldParseC
|
||||
from ..structs cimport TokenC
|
||||
from ..tokens.doc cimport Doc, set_children_from_heads
|
||||
from .stateclass cimport StateClass
|
||||
|
|
39
spacy/syntax/gold_parse.pxd
Normal file
39
spacy/syntax/gold_parse.pxd
Normal file
|
@ -0,0 +1,39 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from .transition_system cimport Transition
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
# C-level view of the gold-standard annotations held by a GoldParse.
# GoldParse.__init__ allocates tags, heads, has_dep, sent_start, labels and
# ner from its memory pool with one slot per token of the doc; the code that
# fills them is not part of this declaration file.
cdef struct GoldParseC:
    int* tags
    int* heads
    int* has_dep
    int* sent_start
    attr_t* labels
    # NOTE(review): brackets is not allocated in the visible
    # GoldParse.__init__ -- confirm who owns/initialises it.
    int** brackets
    Transition* ner
|
||||
|
||||
|
||||
# Attribute declarations for the GoldParse extension type (implemented in
# gold_parse.pyx). `public` attributes are readable and writable from Python,
# `readonly` ones are readable only, the rest are C-level only.
cdef class GoldParse:
    # Memory pool that owns the arrays referenced from `c`.
    cdef Pool mem

    # C struct holding the per-token annotation arrays.
    cdef GoldParseC c
    # TokenAnnotation built from the gold-standard inputs given to __init__.
    cdef readonly object orig

    # len(doc) at construction time; returned by __len__.
    cdef int length
    # Initialised to 0 in __init__.
    cdef public int loss
    # Per-doc-token annotation lists (one entry per token of the doc, after
    # aligning the gold tokens to the doc's tokenization).
    cdef public list words
    cdef public list tags
    cdef public list pos
    cdef public list morphs
    cdef public list lemmas
    cdef public list sent_starts
    cdef public list heads
    cdef public list labels
    # NOTE(review): orths and brackets are not assigned in the visible
    # __init__ -- confirm where (or whether) they are populated.
    cdef public dict orths
    cdef public list ner
    cdef public dict brackets
    # Copies of the `cats` / `links` dicts passed to __init__ (empty if None).
    cdef public dict cats
    cdef public dict links

    # cand_to_gold[i]: gold token index aligned to doc token i, or None.
    cdef readonly list cand_to_gold
    # gold_to_cand[j]: doc token index aligned to gold token j, or None.
    cdef readonly list gold_to_cand
|
311
spacy/syntax/gold_parse.pyx
Normal file
311
spacy/syntax/gold_parse.pyx
Normal file
|
@ -0,0 +1,311 @@
|
|||
# cython: profile=True
|
||||
import re
|
||||
import random
|
||||
import numpy
|
||||
import tempfile
|
||||
import shutil
|
||||
import itertools
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
import warnings
|
||||
|
||||
from .. import util
|
||||
from ..syntax import nonproj
|
||||
from ..tokens import Doc, Span
|
||||
from ..errors import Errors, AlignmentError, Warnings
|
||||
from .iob_utils import offsets_from_biluo_tags
|
||||
from .align import align
|
||||
|
||||
|
||||
# Pre-compiled pattern matching any single non-word character (`\W`).
# NOTE(review): not referenced in the visible part of this module.
punct_re = re.compile(r"\W")
|
||||
|
||||
def is_punct_label(label):
    """Tell whether a dependency label denotes punctuation.

    label (str): The label to check.
    RETURNS (bool): True if the label is exactly "P", or equals "punct"
        when compared case-insensitively; False otherwise.
    """
    if label == "P":
        return True
    return label.lower() == "punct"
|
||||
|
||||
|
||||
cdef class GoldParse:
    """Collection for training annotations.

    Holds the gold-standard annotations for one document, re-aligned to the
    tokenization of the `Doc` it was created for.

    DOCS: https://spacy.io/api/goldparse
    """
    # NOTE(review): `TokenAnnotation` and `biluo_tags_from_offsets` are used
    # below but are not imported in the visible import block of this module
    # -- confirm they are brought into scope elsewhere.

    @classmethod
    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
        """Create a GoldParse from separate doc- and token-level annotations.

        doc (Doc): The document the annotations refer to.
        doc_annotation: Object providing `cats` and `links`.
        token_annotation: Object providing `words`, `tags`, `pos`, `morphs`,
            `lemmas`, `heads`, `deps`, `entities` and `sent_starts`.
        make_projective (bool): Forwarded unchanged to the constructor.
        RETURNS (GoldParse): The newly constructed object.
        """
        return cls(doc, words=token_annotation.words,
                   tags=token_annotation.tags,
                   pos=token_annotation.pos,
                   morphs=token_annotation.morphs,
                   lemmas=token_annotation.lemmas,
                   heads=token_annotation.heads,
                   deps=token_annotation.deps,
                   entities=token_annotation.entities,
                   sent_starts=token_annotation.sent_starts,
                   cats=doc_annotation.cats,
                   links=doc_annotation.links,
                   make_projective=make_projective)

    def get_token_annotation(self):
        """Bundle the aligned per-token annotations into a TokenAnnotation.

        RETURNS (TokenAnnotation): The annotations of this object. `ids` is
            `0..len(self.words)-1`, or None when `self.words` is empty.
        """
        ids = None
        if self.words:
            ids = list(range(len(self.words)))

        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                               pos=self.pos, morphs=self.morphs,
                               lemmas=self.lemmas, heads=self.heads,
                               deps=self.labels, entities=self.ner,
                               sent_starts=self.sent_starts)

    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
                 lemmas=None, heads=None, deps=None, entities=None,
                 sent_starts=None, make_projective=False, cats=None,
                 links=None):
        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        pos (iterable): A sequence of strings, representing UPOS annotations.
        morphs (iterable): A sequence of strings, representing morph
            annotations.
        lemmas (iterable): A sequence of strings, representing lemma
            annotations.
        heads (iterable): A sequence of integers giving, for each word, the
            index of its syntactic head within `words`.
        deps (iterable): A sequence of strings, representing the syntactic
            relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        sent_starts (iterable): A sequence of sentence position tags, 1 for
            the first word in a sentence, 0 for all others.
        make_projective (bool): If True and both `heads` and `deps` carry
            annotation, projectivize them before aligning to the doc.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        links (dict): A dict with `(start_char, end_char)` keys,
            and the values being dicts with kb_id:value entries,
            representing the external IDs in a knowledge base (KB)
            mapped to either 1.0 or 0.0, indicating positive and
            negative examples respectively.
        RETURNS (GoldParse): The newly constructed object.
        RAISES (ValueError): If the aligned heads contain a cycle (E069).
        """
        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)

        self.cats = {} if cats is None else dict(cats)
        self.links = {} if links is None else dict(links)

        # temporary doc for aligning entity annotation
        entdoc = None

        # avoid allocating memory if the doc does not contain any tokens
        if self.length == 0:
            self.words = []
            self.tags = []
            self.heads = []
            self.labels = []
            self.ner = []
            self.morphs = []
            # set a minimal orig so that the scorer can score an empty doc
            self.orig = TokenAnnotation(ids=[])
        else:
            # Fill in every missing annotation layer with per-word
            # placeholders so the alignment code below can index them freely.
            if not words:
                words = [token.text for token in doc]
            if not tags:
                tags = [None for _ in words]
            if not pos:
                pos = [None for _ in words]
            if not morphs:
                morphs = [None for _ in words]
            if not lemmas:
                lemmas = [None for _ in words]
            if not heads:
                heads = [None for _ in words]
            if not deps:
                deps = [None for _ in words]
            if not sent_starts:
                sent_starts = [None for _ in words]
            if entities is None:
                # No NER annotation at all: mark every token as missing.
                entities = ["-" for _ in words]
            elif len(entities) == 0:
                # Explicitly empty annotation: every token is outside.
                entities = ["O" for _ in words]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
                entities = [(ent if ent is not None else "-") for ent in entities]
                if not isinstance(entities[0], str):
                    # Assume we have entities specified by character offset.
                    # Create a temporary Doc corresponding to provided words
                    # (to preserve gold tokenization) and text (to preserve
                    # character offsets).
                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
                    entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
                    # There may be some additional whitespace tokens in the
                    # temporary doc, so check that the annotations align with
                    # the provided words while building a list of BILUO labels.
                    entities = []
                    words_offset = 0
                    for i in range(len(entdoc_words)):
                        if words[i + words_offset] == entdoc_words[i]:
                            entities.append(entdoc_entities[i])
                        else:
                            words_offset -= 1
                    if len(entities) != len(words):
                        warnings.warn(Warnings.W029.format(text=doc.text))
                        entities = ["-" for _ in words]

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

            self.words = [None] * len(doc)
            self.tags = [None] * len(doc)
            self.pos = [None] * len(doc)
            self.morphs = [None] * len(doc)
            self.lemmas = [None] * len(doc)
            self.heads = [None] * len(doc)
            self.labels = [None] * len(doc)
            self.ner = [None] * len(doc)
            self.sent_starts = [None] * len(doc)

            # This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
                heads, deps = nonproj.projectivize(heads, deps)

            # Do many-to-one alignment for misaligned tokens.
            # If we over-segment, we'll have one gold word that covers a sequence
            # of predicted words
            # If we under-segment, we'll have one predicted word that covers a
            # sequence of gold words.
            # If we "mis-segment", we'll have a sequence of predicted words covering
            # a sequence of gold words. That's many-to-many -- we don't do that
            # except for NER spans where the start and end can be aligned.
            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

            # Negative alignment indices mean "no one-to-one match".
            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

            self.orig = TokenAnnotation(ids=list(range(len(words))),
                    words=words, tags=tags, pos=pos, morphs=morphs,
                    lemmas=lemmas, heads=heads, deps=deps, entities=entities,
                    sent_starts=sent_starts, brackets=[])

            for i, gold_i in enumerate(self.cand_to_gold):
                if doc[i].text.isspace():
                    self.words[i] = doc[i].text
                    self.tags[i] = "_SP"
                    self.pos[i] = "SPACE"
                    self.morphs[i] = None
                    self.lemmas[i] = None
                    self.heads[i] = None
                    self.labels[i] = None
                    self.ner[i] = None
                    self.sent_starts[i] = 0
                if gold_i is None:
                    if i in i2j_multi:
                        self.words[i] = words[i2j_multi[i]]
                        self.tags[i] = tags[i2j_multi[i]]
                        self.pos[i] = pos[i2j_multi[i]]
                        self.morphs[i] = morphs[i2j_multi[i]]
                        self.lemmas[i] = lemmas[i2j_multi[i]]
                        self.sent_starts[i] = sent_starts[i2j_multi[i]]
                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
                        # Set next word in multi-token span as head, until last
                        if not is_last:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
                            head_i = heads[i2j_multi[i]]
                            # Compare against None explicitly: a head index of
                            # 0 (the first gold token) is valid but falsy, and
                            # must not be treated as a missing head.
                            if head_i is not None:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        ner_tag = entities[i2j_multi[i]]
                        # Assign O/- for many-to-one O/- NER tags
                        if ner_tag in ("O", "-"):
                            self.ner[i] = ner_tag
                else:
                    self.words[i] = words[gold_i]
                    self.tags[i] = tags[gold_i]
                    self.pos[i] = pos[gold_i]
                    self.morphs[i] = morphs[gold_i]
                    self.lemmas[i] = lemmas[gold_i]
                    self.sent_starts[i] = sent_starts[gold_i]
                    if heads[gold_i] is None:
                        self.heads[i] = None
                    else:
                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
                    self.labels[i] = deps[gold_i]
                    self.ner[i] = entities[gold_i]
            # Assign O/- for one-to-many O/- NER tags
            for j, cand_j in enumerate(self.gold_to_cand):
                if cand_j is None:
                    if j in j2i_multi:
                        i = j2i_multi[j]
                        ner_tag = entities[j]
                        if ner_tag in ("O", "-"):
                            self.ner[i] = ner_tag

            # If there is entity annotation and some tokens remain unaligned,
            # align all entities at the character level to account for all
            # possible token misalignments within the entity spans
            if any([e not in ("O", "-") for e in entities]) and None in self.ner:
                # If the temporary entdoc wasn't created above, initialize it
                if not entdoc:
                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
                # Get offsets based on gold words and BILUO entities
                entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
                aligned_offsets = []
                aligned_spans = []
                # Filter offsets to identify those that align with doc tokens
                for offset in entdoc_offsets:
                    span = doc.char_span(offset[0], offset[1])
                    if span and not span.text.isspace():
                        aligned_offsets.append(offset)
                        aligned_spans.append(span)
                # Convert back to BILUO for doc tokens and assign NER for all
                # aligned spans
                biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
                for span in aligned_spans:
                    for i in range(span.start, span.end):
                        self.ner[i] = biluo_tags[i]

            # Prevent whitespace that isn't within entities from being tagged as
            # an entity.
            for i in range(len(self.ner)):
                if self.tags[i] == "_SP":
                    prev_ner = self.ner[i-1] if i >= 1 else None
                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
                    if prev_ner == "O" or next_ner == "O":
                        self.ner[i] = "O"

            # Reject annotations whose aligned heads form a cycle.
            cycle = nonproj.contains_cycle(self.heads)
            if cycle is not None:
                raise ValueError(Errors.E069.format(cycle=cycle,
                    cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
                    doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective
        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)
|
|
@ -1,6 +1,6 @@
|
|||
from .transition_system cimport TransitionSystem
|
||||
from .transition_system cimport Transition
|
||||
from ..gold cimport GoldParseC
|
||||
from .gold_parse cimport GoldParseC
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ from .stateclass cimport StateClass
|
|||
from ._state cimport StateC
|
||||
from .transition_system cimport Transition
|
||||
from .transition_system cimport do_func_t
|
||||
from ..gold cimport GoldParseC, GoldParse
|
||||
from .gold_parse cimport GoldParseC, GoldParse
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..attrs cimport IS_SPACE
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ import numpy
|
|||
import warnings
|
||||
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..gold cimport GoldParse
|
||||
from .gold_parse cimport GoldParse
|
||||
from ..typedefs cimport weight_t, class_t, hash_t
|
||||
from ._parser_model cimport alloc_activations, free_activations
|
||||
from ._parser_model cimport predict_states, arg_max_if_valid
|
||||
|
|
|
@ -2,8 +2,8 @@ from cymem.cymem cimport Pool
|
|||
|
||||
from ..typedefs cimport attr_t, weight_t
|
||||
from ..structs cimport TokenC
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
from .gold_parse cimport GoldParse
|
||||
from .gold_parse cimport GoldParseC
|
||||
from ..strings cimport StringStore
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
|
Loading…
Reference in New Issue
Block a user