mirror of https://github.com/explosion/spaCy.git (synced 2025-03-04 11:25:51 +03:00)

Commit d53723aa4f: Merge from whatif/arrow
setup.py (5 changes)

@@ -23,6 +23,8 @@ Options.docstrings = True
 PACKAGES = find_packages()
 MOD_NAMES = [
+    "spacy.gold.align",
+    "spacy.gold.new_example",
     "spacy.parts_of_speech",
     "spacy.strings",
     "spacy.lexeme",
@@ -35,13 +37,14 @@ MOD_NAMES = [
     "spacy.syntax.stateclass",
     "spacy.syntax._state",
     "spacy.tokenizer",
+    "spacy.syntax.gold_parse",
     "spacy.syntax.nn_parser",
     "spacy.syntax._parser_model",
     "spacy.syntax._beam_utils",
     "spacy.syntax.nonproj",
     "spacy.syntax.transition_system",
     "spacy.syntax.arc_eager",
-    "spacy.gold",
+    "spacy.gold.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
     "spacy.tokens.token",
(next changed file, name not captured)

@@ -2,6 +2,7 @@ import re
 from ...gold import Example
 from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
+from ...gold import TokenAnnotation
 from ...language import Language
 from ...tokens import Doc, Token
 from .conll_ner2json import n_sents_info
@@ -284,13 +285,8 @@ def example_from_conllu_sentence(
         spaces.append(t._.merged_spaceafter)
     ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
     ents = biluo_tags_from_offsets(doc, ent_offsets)
-    raw = ""
-    for word, space in zip(words, spaces):
-        raw += word
-        if space:
-            raw += " "
-    example = Example(doc=raw)
-    example.set_token_annotation(
+    example = Example(doc=Doc(vocab, words=words, spaces=spaces))
+    example.token_annotation = TokenAnnotation(
         ids=ids,
         words=words,
         tags=tags,
(next changed file, name not captured)

@@ -13,7 +13,11 @@ from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
 
 from ..gold import GoldCorpus
+<<<<<<< HEAD
+from ..gold import Example
+=======
 from ..lookups import Lookups
+>>>>>>> origin/develop
 from .. import util
 from ..errors import Errors
 from ..ml import models  # don't remove - required to load the built-in architectures
@@ -223,7 +227,6 @@ def train(
     limit = training["limit"]
     msg.info("Loading training corpus")
     corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
-
     # verify textcat config
     if "textcat" in nlp_config["pipeline"]:
         textcat_labels = set(nlp.get_pipe("textcat").labels)
@@ -281,9 +284,7 @@ def train(
         nlp.resume_training()
     else:
         msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
-        nlp.begin_training(
-            lambda: corpus.train_examples
-        )
+        nlp.begin_training(lambda: corpus.train_dataset(nlp))
 
     # Update tag map with provided mapping
     nlp.vocab.morphology.tag_map.update(tag_map)
@@ -373,6 +374,16 @@ def train(
 def create_train_batches(nlp, corpus, cfg):
     epochs_todo = cfg.get("max_epochs", 0)
     while True:
+<<<<<<< HEAD
+        train_examples = list(corpus.train_dataset(
+            nlp,
+            noise_level=0.0,
+            orth_variant_level=cfg["orth_variant_level"],
+            gold_preproc=cfg["gold_preproc"],
+            max_length=cfg["max_length"],
+            ignore_misaligned=True
+        ))
+=======
         train_examples = list(
             corpus.train_dataset(
                 nlp,
@@ -383,6 +394,7 @@ def create_train_batches(nlp, corpus, cfg):
                 ignore_misaligned=True,
             )
         )
+>>>>>>> origin/develop
         if len(train_examples) == 0:
             raise ValueError(Errors.E988)
         random.shuffle(train_examples)
@@ -413,6 +425,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
             nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
         )
     )
+
     n_words = sum(len(ex.doc) for ex in dev_examples)
     start_time = timer()
(next changed file, name not captured)

@@ -620,6 +620,14 @@ class Errors(object):
     E999 = ("Encountered an unexpected format for the dictionary holding "
             "gold annotations: {gold_dict}")
 
+    # TODO: These were left over after a merge, but I couldn't find them?
+    #E983 = ("Each link annotation should refer to a dictionary with at most one "
+    #        "identifier mapping to 1.0, and all others to 0.0.")
+    #E984 = ("The offsets of the annotations for 'links' need to refer exactly "
+    #        "to the offsets of the 'entities' annotations.")
+    #E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
+    #        "into {values}, but found {value}.")
+
 
 @add_codes
 class TempErrors(object):
(deleted file, 68 lines, name not captured)

@@ -1,68 +0,0 @@
from cymem.cymem cimport Pool

from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition

from .tokens import Doc


cdef struct GoldParseC:
    int* tags
    int* heads
    int* has_dep
    int* sent_start
    attr_t* labels
    int** brackets
    Transition* ner


cdef class GoldParse:
    cdef Pool mem

    cdef GoldParseC c
    cdef readonly TokenAnnotation orig

    cdef int length
    cdef public int loss
    cdef public list words
    cdef public list tags
    cdef public list pos
    cdef public list morphs
    cdef public list lemmas
    cdef public list sent_starts
    cdef public list heads
    cdef public list labels
    cdef public dict orths
    cdef public list ner
    cdef public dict brackets
    cdef public dict cats
    cdef public dict links

    cdef readonly list cand_to_gold
    cdef readonly list gold_to_cand


cdef class TokenAnnotation:
    cdef public list ids
    cdef public list words
    cdef public list tags
    cdef public list pos
    cdef public list morphs
    cdef public list lemmas
    cdef public list heads
    cdef public list deps
    cdef public list entities
    cdef public list sent_starts
    cdef public dict brackets_by_start


cdef class DocAnnotation:
    cdef public object cats
    cdef public object links


cdef class Example:
    cdef public object doc
    cdef public TokenAnnotation token_annotation
    cdef public DocAnnotation doc_annotation
    cdef public object goldparse
spacy/gold.pyx (1419 changes): file diff suppressed because it is too large.
spacy/gold/__init__.pxd (new file, empty)
spacy/gold/__init__.py (new file, 13 lines)

from .corpus import GoldCorpus
from ..syntax.gold_parse import GoldParse
from .example import Example
from .annotation import TokenAnnotation, DocAnnotation
from .align import align

from .iob_utils import iob_to_biluo, biluo_to_iob
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
from .iob_utils import spans_from_biluo_tags
from .iob_utils import tags_to_entities

from .gold_io import docs_to_json
from .gold_io import read_json_file
spacy/gold/align.pxd (new file, 8 lines)

cdef class Alignment:
    cdef public object cost
    cdef public object i2j
    cdef public object j2i
    cdef public object i2j_multi
    cdef public object j2i_multi
    cdef public object cand_to_gold
    cdef public object gold_to_cand
spacy/gold/align.pyx (new file, 101 lines)

import numpy
from ..errors import Errors, AlignmentError


cdef class Alignment:
    def __init__(self, spacy_words, gold_words):
        # Do many-to-one alignment for misaligned tokens.
        # If we over-segment, we'll have one gold word that covers a sequence
        # of predicted words
        # If we under-segment, we'll have one predicted word that covers a
        # sequence of gold words.
        # If we "mis-segment", we'll have a sequence of predicted words covering
        # a sequence of gold words. That's many-to-many -- we don't do that
        # except for NER spans where the start and end can be aligned.
        cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
        self.cost = cost
        self.i2j = i2j
        self.j2i = j2i
        self.i2j_multi = i2j_multi
        self.j2i_multi = j2i_multi
        self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
        self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]


def align(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens.
      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
        For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
        to `tokens_b[6]`. If there's no one-to-one alignment for a token,
        it has the value -1.
      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
        direction.
    """
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    cost = 0
    a2b = numpy.empty(len(tokens_a), dtype="i")
    b2a = numpy.empty(len(tokens_b), dtype="i")
    a2b.fill(-1)
    b2a.fill(-1)
    a2b_multi = {}
    b2a_multi = {}
    i = 0
    j = 0
    offset_a = 0
    offset_b = 0
    while i < len(tokens_a) and j < len(tokens_b):
        a = tokens_a[i][offset_a:]
        b = tokens_b[j][offset_b:]
        if a == b:
            if offset_a == offset_b == 0:
                a2b[i] = j
                b2a[j] = i
            elif offset_a == 0:
                cost += 2
                a2b_multi[i] = j
            elif offset_b == 0:
                cost += 2
                b2a_multi[j] = i
            offset_a = offset_b = 0
            i += 1
            j += 1
        elif a == "":
            assert offset_a == 0
            cost += 1
            i += 1
        elif b == "":
            assert offset_b == 0
            cost += 1
            j += 1
        elif b.startswith(a):
            cost += 1
            if offset_a == 0:
                a2b_multi[i] = j
            i += 1
            offset_a = 0
            offset_b += len(a)
        elif a.startswith(b):
            cost += 1
            if offset_b == 0:
                b2a_multi[j] = i
            j += 1
            offset_b = 0
            offset_a += len(b)
        else:
            assert "".join(tokens_a) != "".join(tokens_b)
            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
    return cost, a2b, b2a, a2b_multi, b2a_multi


def _normalize_for_alignment(tokens):
    return [w.replace(" ", "").lower() for w in tokens]
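A quick illustration of the align() helper added above (a minimal sketch with made-up tokenizations, not part of the commit):

    from spacy.gold.align import align

    # Candidate vs. reference tokenizations that disagree on "'s"
    cost, a2b, b2a, a2b_multi, b2a_multi = align(
        ["I", "listened", "to", "obama", "'", "s", "podcasts", "."],
        ["i", "listened", "to", "obama", "'s", "podcasts", "."],
    )
    # "'" and "s" have no one-to-one match, so a2b[4] == a2b[5] == -1,
    # while a2b_multi maps both candidate tokens 4 and 5 onto reference token 4.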
spacy/gold/annotation.py (new file, 150 lines)

from .iob_utils import biluo_tags_from_offsets


class TokenAnnotation:
    def __init__(
        self,
        ids=None,
        words=None,
        tags=None,
        pos=None,
        morphs=None,
        lemmas=None,
        heads=None,
        deps=None,
        entities=None,
        sent_starts=None,
        brackets=None,
    ):
        self.ids = ids if ids else []
        self.words = words if words else []
        self.tags = tags if tags else []
        self.pos = pos if pos else []
        self.morphs = morphs if morphs else []
        self.lemmas = lemmas if lemmas else []
        self.heads = heads if heads else []
        self.deps = deps if deps else []
        self.entities = entities if entities else []
        self.sent_starts = sent_starts if sent_starts else []
        self.brackets_by_start = {}
        if brackets:
            for b_start, b_end, b_label in brackets:
                self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label))

    def get_field(self, field):
        if field == "id":
            return self.ids
        elif field == "word":
            return self.words
        elif field == "tag":
            return self.tags
        elif field == "pos":
            return self.pos
        elif field == "morph":
            return self.morphs
        elif field == "lemma":
            return self.lemmas
        elif field == "head":
            return self.heads
        elif field == "dep":
            return self.deps
        elif field == "ner":
            return self.entities
        elif field == "sent_start":
            return self.sent_starts
        else:
            raise ValueError(f"Unknown field: {field}")

    @property
    def brackets(self):
        brackets = []
        for start, ends_labels in self.brackets_by_start.items():
            for end, label in ends_labels:
                brackets.append((start, end, label))
        return brackets

    @classmethod
    def from_dict(cls, token_dict):
        return cls(
            ids=token_dict.get("ids", None),
            words=token_dict.get("words", None),
            tags=token_dict.get("tags", None),
            pos=token_dict.get("pos", None),
            morphs=token_dict.get("morphs", None),
            lemmas=token_dict.get("lemmas", None),
            heads=token_dict.get("heads", None),
            deps=token_dict.get("deps", None),
            entities=token_dict.get("entities", None),
            sent_starts=token_dict.get("sent_starts", None),
            brackets=token_dict.get("brackets", None),
        )

    def to_dict(self):
        return {
            "ids": self.ids,
            "words": self.words,
            "tags": self.tags,
            "pos": self.pos,
            "morphs": self.morphs,
            "lemmas": self.lemmas,
            "heads": self.heads,
            "deps": self.deps,
            "entities": self.entities,
            "sent_starts": self.sent_starts,
            "brackets": self.brackets,
        }

    def get_id(self, i):
        return self.ids[i] if i < len(self.ids) else i

    def get_word(self, i):
        return self.words[i] if i < len(self.words) else ""

    def get_tag(self, i):
        return self.tags[i] if i < len(self.tags) else "-"

    def get_pos(self, i):
        return self.pos[i] if i < len(self.pos) else ""

    def get_morph(self, i):
        return self.morphs[i] if i < len(self.morphs) else ""

    def get_lemma(self, i):
        return self.lemmas[i] if i < len(self.lemmas) else ""

    def get_head(self, i):
        return self.heads[i] if i < len(self.heads) else i

    def get_dep(self, i):
        return self.deps[i] if i < len(self.deps) else ""

    def get_entity(self, i):
        return self.entities[i] if i < len(self.entities) else "-"

    def get_sent_start(self, i):
        return self.sent_starts[i] if i < len(self.sent_starts) else None

    def __str__(self):
        return str(self.to_dict())

    def __repr__(self):
        return self.__str__()


class DocAnnotation:
    def __init__(self, cats=None, links=None):
        self.cats = cats if cats else {}
        self.links = links if links else {}

    @classmethod
    def from_dict(cls, doc_dict):
        return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))

    def to_dict(self):
        return {"cats": self.cats, "links": self.links}

    def __str__(self):
        return str(self.to_dict())

    def __repr__(self):
        return self.__str__()
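For orientation, the TokenAnnotation container above round-trips through plain dicts, and out-of-range lookups fall back to per-field defaults (a small sketch, not part of the diff):

    annot = TokenAnnotation.from_dict({"words": ["London", "calling"], "tags": ["NNP", "VBG"]})
    assert annot.get_field("tag") == ["NNP", "VBG"]
    assert annot.get_tag(5) == "-"      # missing tags default to "-"
    assert annot.get_head(5) == 5       # missing heads default to the index itself
    assert annot.to_dict()["words"] == ["London", "calling"]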
spacy/gold/augment.py (new file, 131 lines)

import random
import itertools
from .example import Example
from .annotation import TokenAnnotation


def make_orth_variants(nlp, example, orth_variant_level=0.0):
    if random.random() >= orth_variant_level:
        return example
    if not example.token_annotation:
        return example
    raw = example.text
    lower = False
    if random.random() >= 0.5:
        lower = True
        if raw is not None:
            raw = raw.lower()
    ndsv = nlp.Defaults.single_orth_variants
    ndpv = nlp.Defaults.paired_orth_variants
    # modify words in paragraph_tuples
    variant_example = Example(doc=nlp.make_doc(raw))
    token_annotation = example.token_annotation
    words = token_annotation.words
    tags = token_annotation.tags
    if not words or not tags:
        # add the unmodified annotation
        token_dict = token_annotation.to_dict()
        variant_example.token_annotation = TokenAnnotation(**token_dict)
    else:
        if lower:
            words = [w.lower() for w in words]
        # single variants
        punct_choices = [random.choice(x["variants"]) for x in ndsv]
        for word_idx in range(len(words)):
            for punct_idx in range(len(ndsv)):
                if (
                    tags[word_idx] in ndsv[punct_idx]["tags"]
                    and words[word_idx] in ndsv[punct_idx]["variants"]
                ):
                    words[word_idx] = punct_choices[punct_idx]
        # paired variants
        punct_choices = [random.choice(x["variants"]) for x in ndpv]
        for word_idx in range(len(words)):
            for punct_idx in range(len(ndpv)):
                if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
                    word_idx
                ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                    # backup option: random left vs. right from pair
                    pair_idx = random.choice([0, 1])
                    # best option: rely on paired POS tags like `` / ''
                    if len(ndpv[punct_idx]["tags"]) == 2:
                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
                    # next best option: rely on position in variants
                    # (may not be unambiguous, so order of variants matters)
                    else:
                        for pair in ndpv[punct_idx]["variants"]:
                            if words[word_idx] in pair:
                                pair_idx = pair.index(words[word_idx])
                    words[word_idx] = punct_choices[punct_idx][pair_idx]

        token_dict = token_annotation.to_dict()
        token_dict["words"] = words
        token_dict["tags"] = tags
        variant_example.token_annotation = TokenAnnotation(**token_dict)
    # modify raw to match variant_paragraph_tuples
    if raw is not None:
        variants = []
        for single_variants in ndsv:
            variants.extend(single_variants["variants"])
        for paired_variants in ndpv:
            variants.extend(
                list(itertools.chain.from_iterable(paired_variants["variants"]))
            )
        # store variants in reverse length order to be able to prioritize
        # longer matches (e.g., "---" before "--")
        variants = sorted(variants, key=lambda x: len(x))
        variants.reverse()
        variant_raw = ""
        raw_idx = 0
        # add initial whitespace
        while raw_idx < len(raw) and raw[raw_idx].isspace():
            variant_raw += raw[raw_idx]
            raw_idx += 1
        for word in variant_example.token_annotation.words:
            match_found = False
            # skip whitespace words
            if word.isspace():
                match_found = True
            # add identical word
            elif word not in variants and raw[raw_idx:].startswith(word):
                variant_raw += word
                raw_idx += len(word)
                match_found = True
            # add variant word
            else:
                for variant in variants:
                    if not match_found and raw[raw_idx:].startswith(variant):
                        raw_idx += len(variant)
                        variant_raw += word
                        match_found = True
            # something went wrong, abort
            # (add a warning message?)
            if not match_found:
                return example
            # add following whitespace
            while raw_idx < len(raw) and raw[raw_idx].isspace():
                variant_raw += raw[raw_idx]
                raw_idx += 1
        variant_example.doc = variant_raw
        return variant_example
    return variant_example


def add_noise(orig, noise_level):
    if random.random() >= noise_level:
        return orig
    elif type(orig) == list:
        corrupted = [_corrupt(word, noise_level) for word in orig]
        corrupted = [w for w in corrupted if w]
        return corrupted
    else:
        return "".join(_corrupt(c, noise_level) for c in orig)


def _corrupt(c, noise_level):
    if random.random() >= noise_level:
        return c
    elif c in [".", "'", "!", "?", ","]:
        return "\n"
    else:
        return c.lower()
spacy/gold/corpus.py (new file, 226 lines)

import random
import shutil
import tempfile
import srsly
from pathlib import Path
import itertools
from ..tokens import Doc
from .. import util
from ..errors import Errors, AlignmentError
from .gold_io import read_json_file, json_to_annotations
from .augment import make_orth_variants, add_noise
from .new_example import NewExample as Example


class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER.

    DOCS: https://spacy.io/api/goldcorpus
    """

    def __init__(self, train, dev, gold_preproc=False, limit=None):
        """Create a GoldCorpus.

        train (str / Path): File or directory of training data.
        dev (str / Path): File or directory of development data.
        RETURNS (GoldCorpus): The newly created object.
        """
        self.limit = limit
        if isinstance(train, str) or isinstance(train, Path):
            train = self.read_annotations(self.walk_corpus(train))
            dev = self.read_annotations(self.walk_corpus(dev))
        # Write temp directory with one doc per file, so we can shuffle and stream
        self.tmp_dir = Path(tempfile.mkdtemp())
        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)

    def __del__(self):
        shutil.rmtree(self.tmp_dir)

    @staticmethod
    def write_msgpack(directory, examples, limit=0):
        if not directory.exists():
            directory.mkdir()
        n = 0
        for i, ex_dict in enumerate(examples):
            text = ex_dict["text"]
            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
            n += 1
            if limit and n >= limit:
                break

    @staticmethod
    def walk_corpus(path):
        path = util.ensure_path(path)
        if not path.is_dir():
            return [path]
        paths = [path]
        locs = []
        seen = set()
        for path in paths:
            if str(path) in seen:
                continue
            seen.add(str(path))
            if path.parts[-1].startswith("."):
                continue
            elif path.is_dir():
                paths.extend(path.iterdir())
            elif path.parts[-1].endswith((".json", ".jsonl")):
                locs.append(path)
        return locs

    @staticmethod
    def read_annotations(locs, limit=0):
        """ Yield training examples """
        i = 0
        for loc in locs:
            loc = util.ensure_path(loc)
            file_name = loc.parts[-1]
            if file_name.endswith("json"):
                examples = read_json_file(loc)
            elif file_name.endswith("jsonl"):
                gold_tuples = srsly.read_jsonl(loc)
                first_gold_tuple = next(gold_tuples)
                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
                # TODO: proper format checks with schemas
                if isinstance(first_gold_tuple, dict):
                    if first_gold_tuple.get("paragraphs", None):
                        examples = []
                        for json_doc in gold_tuples:
                            examples.extend(json_to_annotations(json_doc))
                    elif first_gold_tuple.get("doc_annotation", None):
                        examples = []
                        for ex_dict in gold_tuples:
                            doc = ex_dict.get("doc", None)
                            if doc is None:
                                doc = ex_dict.get("text", None)
                            if not (
                                doc is None
                                or isinstance(doc, Doc)
                                or isinstance(doc, str)
                            ):
                                raise ValueError(Errors.E987.format(type=type(doc)))
                            examples.append(ex_dict)

            elif file_name.endswith("msg"):
                text, ex_dict = srsly.read_msgpack(loc)
                examples = [ex_dict]
            else:
                supported = ("json", "jsonl", "msg")
                raise ValueError(Errors.E124.format(path=loc, formats=supported))
            try:
                for example in examples:
                    yield example
                    i += 1
                    if limit and i >= limit:
                        return
            except KeyError as e:
                msg = "Missing key {}".format(e)
                raise KeyError(Errors.E996.format(file=file_name, msg=msg))
            except UnboundLocalError as e:
                msg = "Unexpected document structure"
                raise ValueError(Errors.E996.format(file=file_name, msg=msg))

    @property
    def dev_annotations(self):
        locs = (self.tmp_dir / "dev").iterdir()
        yield from self.read_annotations(locs, limit=self.limit)

    @property
    def train_annotations(self):
        locs = (self.tmp_dir / "train").iterdir()
        yield from self.read_annotations(locs, limit=self.limit)

    def count_train(self):
        """Returns count of words in train examples"""
        n = 0
        i = 0
        for eg_dict in self.train_annotations:
            n += len(eg_dict["token_annotation"]["words"])
            if self.limit and i >= self.limit:
                break
            i += 1
        return n

    def train_dataset(
        self,
        nlp,
        gold_preproc=False,
        max_length=None,
        noise_level=0.0,
        orth_variant_level=0.0,
        ignore_misaligned=False,
    ):
        locs = list((self.tmp_dir / "train").iterdir())
        random.shuffle(locs)
        train_annotations = self.read_annotations(locs, limit=self.limit)
        examples = self.iter_examples(
            nlp,
            train_annotations,
            gold_preproc,
            max_length=max_length,
            noise_level=noise_level,
            orth_variant_level=orth_variant_level,
            make_projective=True,
            ignore_misaligned=ignore_misaligned,
        )
        yield from examples

    def train_dataset_without_preprocessing(
        self, nlp, gold_preproc=False, ignore_misaligned=False
    ):
        examples = self.iter_examples(
            nlp,
            self.train_annotations,
            gold_preproc=gold_preproc,
            ignore_misaligned=ignore_misaligned,
        )
        yield from examples

    def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
        examples = self.iter_examples(
            nlp,
            self.dev_annotations,
            gold_preproc=gold_preproc,
            ignore_misaligned=ignore_misaligned,
        )
        yield from examples

    @classmethod
    def iter_examples(
        cls,
        nlp,
        annotations,
        gold_preproc,
        max_length=None,
        noise_level=0.0,
        orth_variant_level=0.0,
        make_projective=False,
        ignore_misaligned=False,
    ):
        """ Setting gold_preproc will result in creating a doc per sentence """
        for eg_dict in annotations:
            if eg_dict["text"]:
                example = Example.from_dict(
                    nlp.make_doc(eg_dict["text"]),
                    eg_dict
                )
            else:
                example = Example.from_dict(
                    Doc(nlp.vocab, words=eg_dict["words"]),
                    eg_dict
                )
            if gold_preproc:
                # TODO: Data augmentation
                examples = example.split_sents()
            else:
                examples = [example]
            for ex in examples:
                if (not max_length) or len(ex.predicted) < max_length:
                    if ignore_misaligned:
                        try:
                            _ = ex._deprecated_get_gold()
                        except AlignmentError:
                            continue
                    yield ex
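As a usage sketch (the file paths and limit are only placeholders, not from the commit), the GoldCorpus above is constructed from train/dev JSON paths and streams Example objects:

    import spacy
    from spacy.gold import GoldCorpus

    nlp = spacy.blank("en")
    corpus = GoldCorpus("train.json", "dev.json", limit=1000)
    print(corpus.count_train())  # number of training words
    for example in corpus.train_dataset(nlp, gold_preproc=False, ignore_misaligned=True):
        pass  # each item pairs a predicted Doc with its gold annotations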
spacy/gold/example.py (new file, 261 lines)

import numpy
from .annotation import TokenAnnotation, DocAnnotation
from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets
from .align import Alignment
from ..errors import Errors, AlignmentError
from ..tokens import Doc


def annotations2doc(doc, doc_annot, tok_annot):
    # TODO: Improve and test this
    words = tok_annot.words or [tok.text for tok in doc]
    fields = {
        "tags": "TAG",
        "pos": "POS",
        "lemmas": "LEMMA",
        "deps": "DEP",
    }
    attrs = []
    values = []
    for field, attr in fields.items():
        value = getattr(tok_annot, field)
        # Unset fields will be empty lists.
        if value:
            attrs.append(attr)
            values.append([doc.vocab.strings.add(v) for v in value])
    if tok_annot.heads:
        attrs.append("HEAD")
        values.append([h - i for i, h in enumerate(tok_annot.heads)])
    output = Doc(doc.vocab, words=words)
    if values:
        array = numpy.array(values, dtype="uint64")
        output = output.from_array(attrs, array.T)
    if tok_annot.entities:
        output.ents = spans_from_biluo_tags(output, tok_annot.entities)
    doc.cats = dict(doc_annot.cats)
    # TODO: Calculate token.ent_kb_id from links.
    # We need to fix this and the doc.ents thing, both should be doc
    # annotations.
    return doc


class Example:
    def __init__(self, doc, doc_annotation=None, token_annotation=None):
        """ Doc can either be text, or an actual Doc """
        if not isinstance(doc, Doc):
            raise TypeError("Must pass Doc instance")
        self.predicted = doc
        self.doc = doc
        self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
        self.token_annotation = (
            token_annotation if token_annotation else TokenAnnotation()
        )
        self._alignment = None
        self.reference = annotations2doc(
            self.doc,
            self.doc_annotation,
            self.token_annotation
        )

    @property
    def x(self):
        return self.predicted

    @property
    def y(self):
        return self.reference

    def _deprecated_get_gold(self, make_projective=False):
        from ..syntax.gold_parse import get_parses_from_example

        _, gold = get_parses_from_example(self, make_projective=make_projective)[0]
        return gold

    @classmethod
    def from_dict(cls, example_dict, doc=None):
        if example_dict is None:
            raise ValueError("Example.from_dict expected dict, received None")
        if doc is None:
            raise ValueError("Must pass doc")
        # TODO: This is ridiculous...
        token_dict = example_dict.get("token_annotation", {})
        doc_dict = example_dict.get("doc_annotation", {})
        for key, value in example_dict.items():
            if key in ("token_annotation", "doc_annotation"):
                pass
            elif key in ("cats", "links"):
                doc_dict[key] = value
            else:
                token_dict[key] = value
        if token_dict.get("entities"):
            entities = token_dict["entities"]
            if isinstance(entities[0], (list, tuple)):
                token_dict["entities"] = biluo_tags_from_offsets(doc, entities)
        token_annotation = TokenAnnotation.from_dict(token_dict)
        doc_annotation = DocAnnotation.from_dict(doc_dict)
        return cls(
            doc=doc, doc_annotation=doc_annotation, token_annotation=token_annotation
        )

    @property
    def alignment(self):
        if self._alignment is None:
            if self.doc is None:
                return None
            spacy_words = [token.orth_ for token in self.predicted]
            gold_words = [token.orth_ for token in self.reference]
            if gold_words == []:
                gold_words = spacy_words
            self._alignment = Alignment(spacy_words, gold_words)
        return self._alignment

    def to_dict(self):
        """ Note that this method does NOT export the doc, only the annotations ! """
        token_dict = self.token_annotation.to_dict()
        doc_dict = self.doc_annotation.to_dict()
        return {"token_annotation": token_dict, "doc_annotation": doc_dict}

    @property
    def text(self):
        if self.doc is None:
            return None
        if isinstance(self.doc, Doc):
            return self.doc.text
        return self.doc

    def get_aligned(self, field):
        """Return an aligned array for a token annotation field."""
        if self.doc is None:
            return self.token_annotation.get_field(field)
        doc = self.doc
        if field == "word":
            return [token.orth_ for token in doc]
        gold_values = self.token_annotation.get_field(field)
        alignment = self.alignment
        i2j_multi = alignment.i2j_multi
        gold_to_cand = alignment.gold_to_cand
        cand_to_gold = alignment.cand_to_gold

        output = []
        for i, gold_i in enumerate(cand_to_gold):
            if doc[i].text.isspace():
                output.append(None)
            elif gold_i is None:
                if i in i2j_multi:
                    output.append(gold_values[i2j_multi[i]])
                else:
                    output.append(None)
            else:
                output.append(gold_values[gold_i])
        return output

    def set_doc_annotation(self, cats=None, links=None):
        if cats:
            self.doc_annotation.cats = cats
        if links:
            self.doc_annotation.links = links

    def split_sents(self):
        """ Split the token annotations into multiple Examples based on
        sent_starts and return a list of the new Examples"""
        if not self.token_annotation.words:
            return [self]
        s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
        s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
        s_brackets = []
        sent_start_i = 0
        t = self.token_annotation
        split_examples = []
        for i in range(len(t.words)):
            if i > 0 and t.sent_starts[i] == 1:
                split_examples.append(
                    Example(
                        doc=Doc(self.doc.vocab, words=s_words),
                        token_annotation=TokenAnnotation(
                            ids=s_ids,
                            words=s_words,
                            tags=s_tags,
                            pos=s_pos,
                            morphs=s_morphs,
                            lemmas=s_lemmas,
                            heads=s_heads,
                            deps=s_deps,
                            entities=s_ents,
                            sent_starts=s_sent_starts,
                            brackets=s_brackets,
                        ),
                        doc_annotation=self.doc_annotation
                    )
                )
                s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                s_sent_starts, s_brackets = [], []
                sent_start_i = i
            s_ids.append(t.get_id(i))
            s_words.append(t.get_word(i))
            s_tags.append(t.get_tag(i))
            s_pos.append(t.get_pos(i))
            s_morphs.append(t.get_morph(i))
            s_lemmas.append(t.get_lemma(i))
            s_heads.append(t.get_head(i) - sent_start_i)
            s_deps.append(t.get_dep(i))
            s_ents.append(t.get_entity(i))
            s_sent_starts.append(t.get_sent_start(i))
            for b_end, b_label in t.brackets_by_start.get(i, []):
                s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
            i += 1
        split_examples.append(
            Example(
                doc=Doc(self.doc.vocab, words=s_words),
                token_annotation=TokenAnnotation(
                    ids=s_ids,
                    words=s_words,
                    tags=s_tags,
                    pos=s_pos,
                    morphs=s_morphs,
                    lemmas=s_lemmas,
                    heads=s_heads,
                    deps=s_deps,
                    entities=s_ents,
                    sent_starts=s_sent_starts,
                    brackets=s_brackets,
                ),
                doc_annotation=self.doc_annotation
            )
        )
        return split_examples

    @classmethod
    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
        """
        Return a list of Example objects, from a variety of input formats.
        make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
        """
        if isinstance(examples, Example):
            return [examples]
        if isinstance(examples, tuple):
            examples = [examples]
        converted_examples = []
        for ex in examples:
            if isinstance(ex, Example):
                converted_examples.append(ex)
            # convert string to Doc to Example
            elif isinstance(ex, str):
                if keep_raw_text:
                    converted_examples.append(Example(doc=ex))
                else:
                    doc = make_doc(ex)
                    converted_examples.append(Example(doc=doc))
            # convert tuples to Example
            elif isinstance(ex, tuple) and len(ex) == 2:
                doc, gold = ex
                # convert string to Doc
                if isinstance(doc, str) and not keep_raw_text:
                    doc = make_doc(doc)
                converted_examples.append(Example.from_dict(gold, doc=doc))
            # convert Doc to Example
            elif isinstance(ex, Doc):
                converted_examples.append(Example(doc=ex))
            else:
                converted_examples.append(ex)
        return converted_examples
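A small sketch of the dict-based constructor above (illustrative values; assumes an existing nlp object, not part of the commit):

    doc = nlp.make_doc("I like London")
    eg = Example.from_dict(
        {"words": ["I", "like", "London"], "entities": [(7, 13, "LOC")]},
        doc=doc,
    )
    # Offset entities are converted to BILUO tags on the way in, and
    # get_aligned() projects gold fields onto the predicted tokenization.
    ner_tags = eg.get_aligned("ner")   # ["O", "O", "U-LOC"]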
spacy/gold/gold_io.pyx (new file, 198 lines)

import warnings
import srsly
from .. import util
from ..errors import Warnings
from ..tokens import Token, Doc
from .iob_utils import biluo_tags_from_offsets


def merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_cats = {}
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
                          for b in brackets)
        m_cats.update(cats)
        i += len(ids)
    return [(m_deps, (m_cats, m_brackets))]


def docs_to_json(docs, id=0, ner_missing_tag="O"):
    """Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command.

    docs (iterable / Doc): The Doc object(s) to convert.
    id (int): Id for the JSON.
    RETURNS (dict): The data in spaCy's JSON format
        - each input doc will be treated as a paragraph in the output doc
    """
    if isinstance(docs, Doc):
        docs = [docs]
    json_doc = {"id": id, "paragraphs": []}
    for i, doc in enumerate(docs):
        json_para = {'raw': doc.text, "sentences": [], "cats": []}
        for cat, val in doc.cats.items():
            json_cat = {"label": cat, "value": val}
            json_para["cats"].append(json_cat)
        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
        for j, sent in enumerate(doc.sents):
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
                json_token = {"id": token.i, "orth": token.text}
                if doc.is_tagged:
                    json_token["tag"] = token.tag_
                    json_token["pos"] = token.pos_
                    json_token["morph"] = token.morph_
                    json_token["lemma"] = token.lemma_
                if doc.is_parsed:
                    json_token["head"] = token.head.i-token.i
                    json_token["dep"] = token.dep_
                json_token["ner"] = biluo_tags[token.i]
                json_sent["tokens"].append(json_token)
            json_para["sentences"].append(json_sent)
        json_doc["paragraphs"].append(json_para)
    return json_doc


def read_json_file(loc, docs_filter=None, limit=None):
    loc = util.ensure_path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
            yield from read_json_file(loc / filename, limit=limit)
    else:
        for doc in json_iterate(loc):
            if docs_filter is not None and not docs_filter(doc):
                continue
            for json_data in json_to_annotations(doc):
                yield json_data


def json_to_annotations(doc):
    """Convert an item in the JSON-formatted training data to the format
    used by GoldParse.

    doc (dict): One entry in the training data.
    YIELDS (tuple): The reformatted data - one training example per paragraph
    """
    for paragraph in doc["paragraphs"]:
        example = {"text": paragraph.get("raw", None)}
        words = []
        ids = []
        tags = []
        pos = []
        morphs = []
        lemmas = []
        heads = []
        labels = []
        ner = []
        sent_starts = []
        brackets = []
        for sent in paragraph["sentences"]:
            sent_start_i = len(words)
            for i, token in enumerate(sent["tokens"]):
                words.append(token["orth"])
                ids.append(token.get('id', sent_start_i + i))
                tags.append(token.get('tag', "-"))
                pos.append(token.get("pos", ""))
                morphs.append(token.get("morph", ""))
                lemmas.append(token.get("lemma", ""))
                heads.append(token.get("head", 0) + sent_start_i + i)
                labels.append(token.get("dep", ""))
                # Ensure ROOT label is case-insensitive
                if labels[-1].lower() == "root":
                    labels[-1] = "ROOT"
                ner.append(token.get("ner", "-"))
                if i == 0:
                    sent_starts.append(1)
                else:
                    sent_starts.append(0)
            if "brackets" in sent:
                brackets.extend((b["first"] + sent_start_i,
                                 b["last"] + sent_start_i, b["label"])
                                for b in sent["brackets"])
        cats = {}
        for cat in paragraph.get("cats", {}):
            cats[cat["label"]] = cat["value"]
        example["token_annotation"] = dict(
            ids=ids,
            words=words,
            tags=tags,
            pos=pos,
            morphs=morphs,
            lemmas=lemmas,
            heads=heads,
            deps=labels,
            entities=ner,
            sent_starts=sent_starts,
            brackets=brackets
        )
        example["doc_annotation"] = dict(cats=cats)
        yield example


def json_iterate(loc):
    # We should've made these files jsonl...But since we didn't, parse out
    # the docs one-by-one to reduce memory usage.
    # It's okay to read in the whole file -- just don't parse it into JSON.
    cdef bytes py_raw
    loc = util.ensure_path(loc)
    with loc.open("rb") as file_:
        py_raw = file_.read()
    cdef long file_length = len(py_raw)
    if file_length > 2 ** 30:
        warnings.warn(Warnings.W027.format(size=file_length))

    raw = <char*>py_raw
    cdef int square_depth = 0
    cdef int curly_depth = 0
    cdef int inside_string = 0
    cdef int escape = 0
    cdef long start = -1
    cdef char c
    cdef char quote = ord('"')
    cdef char backslash = ord("\\")
    cdef char open_square = ord("[")
    cdef char close_square = ord("]")
    cdef char open_curly = ord("{")
    cdef char close_curly = ord("}")
    for i in range(file_length):
        c = raw[i]
        if escape:
            escape = False
            continue
        if c == backslash:
            escape = True
            continue
        if c == quote:
            inside_string = not inside_string
            continue
        if inside_string:
            continue
        if c == open_square:
            square_depth += 1
        elif c == close_square:
            square_depth -= 1
        elif c == open_curly:
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif c == close_curly:
            curly_depth -= 1
            if square_depth == 1 and curly_depth == 0:
                py_str = py_raw[start : i + 1].decode("utf8")
                try:
                    yield srsly.json_loads(py_str)
                except Exception:
                    print(py_str)
                    raise
                start = -1
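For context, docs_to_json above produces the paragraph-per-Doc JSON consumed by spacy train; a minimal sketch (the model name and output path are placeholders only):

    import srsly
    import spacy
    from spacy.gold import docs_to_json

    nlp = spacy.load("en_core_web_sm")   # placeholder model name
    doc = nlp("Flights to London were delayed.")
    json_doc = docs_to_json([doc], id=0)
    srsly.write_json("train.json", [json_doc])  # a training file holds a list of such docs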
197
spacy/gold/iob_utils.py
Normal file
197
spacy/gold/iob_utils.py
Normal file
|
@ -0,0 +1,197 @@
|
||||||
|
import warnings
|
||||||
|
from ..errors import Errors, Warnings
|
||||||
|
from ..tokens import Span
|
||||||
|
|
||||||
|
|
||||||
|
def iob_to_biluo(tags):
|
||||||
|
out = []
|
||||||
|
tags = list(tags)
|
||||||
|
while tags:
|
||||||
|
out.extend(_consume_os(tags))
|
||||||
|
out.extend(_consume_ent(tags))
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def biluo_to_iob(tags):
|
||||||
|
out = []
|
||||||
|
for tag in tags:
|
||||||
|
tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
|
||||||
|
out.append(tag)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _consume_os(tags):
|
||||||
|
while tags and tags[0] == "O":
|
||||||
|
yield tags.pop(0)
|
||||||
|
|
||||||
|
|
||||||
|
def _consume_ent(tags):
|
||||||
|
if not tags:
|
||||||
|
return []
|
||||||
|
tag = tags.pop(0)
|
||||||
|
target_in = "I" + tag[1:]
|
||||||
|
target_last = "L" + tag[1:]
|
||||||
|
length = 1
|
||||||
|
while tags and tags[0] in {target_in, target_last}:
|
||||||
|
length += 1
|
||||||
|
tags.pop(0)
|
||||||
|
label = tag[2:]
|
||||||
|
if length == 1:
|
||||||
|
if len(label) == 0:
|
||||||
|
raise ValueError(Errors.E177.format(tag=tag))
|
||||||
|
return ["U-" + label]
|
||||||
|
else:
|
||||||
|
start = "B-" + label
|
||||||
|
end = "L-" + label
|
||||||
|
middle = [f"I-{label}" for _ in range(1, length - 1)]
|
||||||
|
return [start] + middle + [end]
|
||||||
|
|
||||||
|
|
||||||
|
def biluo_tags_from_doc(doc, missing="O"):
|
||||||
|
return biluo_tags_from_offsets(
|
||||||
|
doc,
|
||||||
|
[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
|
||||||
|
missing=missing
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||||
|
"""Encode labelled spans into per-token tags, using the
|
||||||
|
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||||
|
|
||||||
|
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||||
|
will refer to the token boundaries within the document.
|
||||||
|
entities (iterable): A sequence of `(start, end, label)` triples. `start`
|
||||||
|
and `end` should be character-offset integers denoting the slice into
|
||||||
|
the original string.
|
||||||
|
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "", "O" or "{action}-{label}", where
        action is one of "B", "I", "L", "U". The string "-" is used where the
        entity offsets don't align with the tokenization in the `Doc` object.
        The training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = biluo_tags_from_offsets(doc, entities)
        >>> assert tags == ["O", "O", 'U-LOC', "O"]
    """
    # Ensure no overlapping entity labels exist
    tokens_in_ents = {}

    starts = {token.idx: token.i for token in doc}
    ends = {token.idx + len(token): token.i for token in doc}
    biluo = ["-" for _ in doc]
    # Handle entity cases
    for start_char, end_char, label in entities:
        for token_index in range(start_char, end_char):
            if token_index in tokens_in_ents.keys():
                raise ValueError(
                    Errors.E103.format(
                        span1=(
                            tokens_in_ents[token_index][0],
                            tokens_in_ents[token_index][1],
                            tokens_in_ents[token_index][2],
                        ),
                        span2=(start_char, end_char, label),
                    )
                )
            tokens_in_ents[token_index] = (start_char, end_char, label)

        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = f"U-{label}"
            else:
                biluo[start_token] = f"B-{label}"
                for i in range(start_token + 1, end_token):
                    biluo[i] = f"I-{label}"
                biluo[end_token] = f"L-{label}"
    # Now distinguish the O cases from ones where we miss the tokenization
    entity_chars = set()
    for start_char, end_char, label in entities:
        for i in range(start_char, end_char):
            entity_chars.add(i)
    for token in doc:
        for i in range(token.idx, token.idx + len(token)):
            if i in entity_chars:
                break
        else:
            biluo[token.i] = missing
    if "-" in biluo:
        ent_str = str(entities)
        warnings.warn(
            Warnings.W030.format(
                text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
                entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
            )
        )
    return biluo


def spans_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into Span object, e.g.
    to overwrite the doc.ents.

    doc (Doc): The document that the BILUO tags refer to.
    entities (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tags string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects.
    """
    token_offsets = tags_to_entities(tags)
    spans = []
    for label, start_idx, end_idx in token_offsets:
        span = Span(doc, start_idx, end_idx + 1, label=label)
        spans.append(span)
    return spans


def offsets_from_biluo_tags(doc, tags):
    """Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
    entities (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tags string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    spans = spans_from_biluo_tags(doc, tags)
    return [(span.start_char, span.end_char, span.label_) for span in spans]


def tags_to_entities(tags):
    entities = []
    start = None
    for i, tag in enumerate(tags):
        if tag is None:
            continue
        if tag.startswith("O"):
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            if start is not None:
                start = None
            continue
        elif tag == "-":
            continue
        elif tag.startswith("I"):
            if start is None:
                raise ValueError(Errors.E067.format(tags=tags[: i + 1]))
            continue
        if tag.startswith("U"):
            entities.append((tag[2:], i, i))
        elif tag.startswith("B"):
            start = i
        elif tag.startswith("L"):
            entities.append((tag[2:], start, i))
            start = None
        else:
            raise ValueError(Errors.E068.format(tag=tag))
    return entities
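Taken together, the helpers above convert between character offsets, per-token BILUO tags, and Span objects. A minimal round-trip sketch (assuming a blank English pipeline and that the helpers are importable from spacy.gold, as elsewhere on this branch; this snippet is not part of the diff itself):

import spacy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I like London.")
offsets = [(7, 13, "LOC")]                    # character span covering "London"
tags = biluo_tags_from_offsets(doc, offsets)  # ["O", "O", "U-LOC", "O"]
spans = spans_from_biluo_tags(doc, tags)      # one Span over "London" with label "LOC"
doc.ents = spans                              # e.g. overwrite doc.ents from decoded tags
assert offsets_from_biluo_tags(doc, tags) == offsets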
|
8
spacy/gold/new_example.pxd
Normal file
8
spacy/gold/new_example.pxd
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
from ..tokens.doc cimport Doc
|
||||||
|
from .align cimport Alignment
|
||||||
|
|
||||||
|
|
||||||
|
cdef class NewExample:
|
||||||
|
cdef readonly Doc x
|
||||||
|
cdef readonly Doc y
|
||||||
|
cdef readonly Alignment _alignment
|
434  spacy/gold/new_example.pyx  Normal file

@@ -0,0 +1,434 @@
import numpy

from ..tokens import Token
from ..tokens.doc cimport Doc
from ..attrs import IDS
from .align cimport Alignment
from .annotation import TokenAnnotation, DocAnnotation
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
from .align import Alignment
from ..errors import Errors, AlignmentError


cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot):
    # TODO: Improve and test this
    words = tok_annot.get("ORTH", [tok.text for tok in predicted])
    attrs, array = _annot2array(predicted.vocab, tok_annot, doc_annot)
    output = Doc(predicted.vocab, words=words)
    if array.size:
        output = output.from_array(attrs, array)
    output.cats.update(doc_annot.get("cats", {}))
    return output


cdef class NewExample:
    def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
        """ Doc can either be text, or an actual Doc """
        msg = "Example.__init__ got None for '{arg}'. Requires Doc."
        if predicted is None:
            raise TypeError(msg.format(arg="predicted"))
        if reference is None:
            raise TypeError(msg.format(arg="reference"))
        self.x = predicted
        self.y = reference
        self._alignment = alignment

    property predicted:
        def __get__(self):
            return self.x

        def __set__(self, doc):
            self.x = doc

    property reference:
        def __get__(self):
            return self.y

        def __set__(self, doc):
            self.y = doc

    @classmethod
    def from_dict(cls, Doc predicted, dict example_dict):
        if example_dict is None:
            raise ValueError("Example.from_dict expected dict, received None")
        if not isinstance(predicted, Doc):
            raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
        example_dict = _fix_legacy_dict_data(predicted, example_dict)
        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
        return NewExample(
            predicted,
            annotations2doc(predicted, tok_dict, doc_dict)
        )

    @property
    def alignment(self):
        if self._alignment is None:
            if self.doc is None:
                return None
            spacy_words = [token.orth_ for token in self.predicted]
            gold_words = [token.orth_ for token in self.reference]
            if gold_words == []:
                gold_words = spacy_words
            self._alignment = Alignment(spacy_words, gold_words)
        return self._alignment

    def get_aligned(self, field):
        """Return an aligned array for a token attribute."""
        # TODO: This is probably wrong. I just bashed this out and there's probably
        # all sorts of edge-cases.
        alignment = self.alignment
        i2j_multi = alignment.i2j_multi
        gold_to_cand = alignment.gold_to_cand
        cand_to_gold = alignment.cand_to_gold

        gold_values = self.reference.to_array([field])
        output = []
        for i, gold_i in enumerate(cand_to_gold):
            if self.predicted[i].text.isspace():
                output.append(None)
            elif gold_i is None:
                if i in i2j_multi:
                    output.append(gold_values[i2j_multi[i]])
                else:
                    output.append(None)
            else:
                output.append(gold_values[gold_i])
        return output

    def to_dict(self):
        return {
            "doc_annotation": {
                "cats": dict(self.reference.cats),
                "links": [], # TODO
            },
            "token_annotation": {
                "ids": [t.i+1 for t in self.reference],
                "words": [t.text for t in self.reference],
                "tags": [t.tag_ for t in self.reference],
                "lemmas": [t.lemma_ for t in self.reference],
                "pos": [t.pos_ for t in self.reference],
                "morphs": [t.morph_ for t in self.reference],
                "heads": [t.head.i for t in self.reference],
                "deps": [t.dep_ for t in self.reference],
                "sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
                "entities": biluo_tags_from_doc(self.reference)
            }
        }

    def split_sents(self):
        """ Split the token annotations into multiple Examples based on
        sent_starts and return a list of the new Examples"""
        if not self.reference.is_sentenced:
            return [self]
        # TODO: Do this for misaligned somehow?
        predicted_words = [t.text for t in self.predicted]
        reference_words = [t.text for t in self.reference]
        if predicted_words != reference_words:
            raise NotImplementedError("TODO: Implement this")
        # Implement the easy case.
        output = []
        cls = self.__class__
        for sent in self.reference.sents:
            # I guess for misaligned we just need to use the gold_to_cand?
            output.append(
                cls(
                    self.predicted[sent.start : sent.end + 1].as_doc(),
                    sent.as_doc()
                )
            )
        return output

    def text(self):
        return self.x.text


def _annot2array(vocab, tok_annot, doc_annot):
    attrs = []
    values = []

    for key, value in doc_annot.items():
        if key == "entities":
            words = tok_annot["ORTH"]
            ent_iobs, ent_types = _parse_ner_tags(vocab, words, value)
            tok_annot["ENT_IOB"] = ent_iobs
            tok_annot["ENT_TYPE"] = ent_types
        elif key == "links":
            entities = doc_annot.get("entities", {})
            if value and not entities:
                raise ValueError(Errors.E984)
            ent_kb_ids = _parse_links(vocab, words, value, entities)
            tok_annot["ENT_KB_ID"] = ent_kb_ids
        elif key == "cats":
            pass
        else:
            raise ValueError(f"Unknown doc attribute: {key}")

    for key, value in tok_annot.items():
        if key not in IDS:
            raise ValueError(f"Unknown token attribute: {key}")
        elif key == "ORTH":
            pass
        elif key == "HEAD":
            attrs.append(key)
            values.append([h-i for i, h in enumerate(value)])
        elif key == "SENT_START":
            attrs.append(key)
            values.append(value)
        elif key == "MORPH":
            attrs.append(key)
            values.append([vocab.morphology.add(v) for v in value])
        elif key == "ENT_IOB":
            iob_strings = Token.iob_strings()
            attrs.append(key)
            try:
                values.append([iob_strings.index(v) for v in value])
            except ValueError:
                raise ValueError(Errors.E985.format(values=iob_strings, value=values))
        else:
            attrs.append(key)
            values.append([vocab.strings.add(v) for v in value])

    array = numpy.asarray(values, dtype="uint64")
    return attrs, array.T


def _parse_example_dict_data(example_dict):
    return (
        example_dict["token_annotation"],
        example_dict["doc_annotation"]
    )


def _fix_legacy_dict_data(predicted, example_dict):
    token_dict = example_dict.get("token_annotation", {})
    doc_dict = example_dict.get("doc_annotation", {})
    for key, value in example_dict.items():
        if key in ("token_annotation", "doc_annotation"):
            pass
        elif key == "ids":
            pass
        elif key in ("cats", "links") and value:
            doc_dict[key] = value
        elif key in ("ner", "entities") and value:
            doc_dict["entities"] = value
        else:
            token_dict[key] = value
    # Remap keys
    remapping = {
        "words": "ORTH",
        "tags": "TAG",
        "pos": "POS",
        "lemmas": "LEMMA",
        "deps": "DEP",
        "heads": "HEAD",
        "sent_starts": "SENT_START",
        "morphs": "MORPH",
    }
    old_token_dict = token_dict
    token_dict = {}
    for key, value in old_token_dict.items():
        if key in ("text", "ids", "entities", "ner", "brackets"):
            pass
        elif key in remapping:
            token_dict[remapping[key]] = value
        else:
            raise ValueError(f"Unknown attr: {key}")
    if "HEAD" in token_dict and "SENT_START" in token_dict:
        # If heads are set, we don't also redundantly specify SENT_START.
        token_dict.pop("SENT_START")
    return {
        "token_annotation": token_dict,
        "doc_annotation": doc_dict
    }


def _parse_ner_tags(vocab, words, biluo_or_offsets):
    if isinstance(biluo_or_offsets[0], (list, tuple)):
        # Convert to biluo if necessary
        # This is annoying but to convert the offsets we need a Doc
        # that has the target tokenization.
        reference = Doc(vocab, words=words)
        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
    else:
        biluo = biluo_or_offsets
    ent_iobs = []
    ent_types = []
    for iob_tag in biluo_to_iob(biluo):
        ent_iobs.append(iob_tag.split("-")[0])
        if iob_tag.startswith("I") or iob_tag.startswith("B"):
            ent_types.append(iob_tag.split("-", 1)[1])
        else:
            ent_types.append("")
    return ent_iobs, ent_types


def _parse_links(vocab, words, links, entities):
    reference = Doc(vocab, words=words)

    starts = {token.idx: token.i for token in reference}
    ends = {token.idx + len(token): token.i for token in reference}
    ent_kb_ids = ["" for _ in reference]
    entity_map = [(ent[0], ent[1]) for ent in entities]

    # links annotations need to refer 1-1 to entity annotations - throw error otherwise
    for index, annot_dict in links.items():
        start_char, end_char = index
        if (start_char, end_char) not in entity_map:
            raise ValueError(Errors.E984)

    for index, annot_dict in links.items():
        true_kb_ids = []
        for key, value in annot_dict.items():
            if value == 1.0:
                true_kb_ids.append(key)
        if len(true_kb_ids) > 1:
            raise ValueError(Errors.E983)

        if len(true_kb_ids) == 1:
            start_char, end_char = index
            start_token = starts.get(start_char)
            end_token = ends.get(end_char)
            for i in range(start_token, end_token+1):
                ent_kb_ids[i] = true_kb_ids[0]

    return ent_kb_ids


class Example:
    def get_aligned(self, field):
        """Return an aligned array for a token annotation field."""
        if self.doc is None:
            return self.token_annotation.get_field(field)
        doc = self.doc
        if field == "word":
            return [token.orth_ for token in doc]
        gold_values = self.token_annotation.get_field(field)
        alignment = self.alignment
        i2j_multi = alignment.i2j_multi
        gold_to_cand = alignment.gold_to_cand
        cand_to_gold = alignment.cand_to_gold

        output = []
        for i, gold_i in enumerate(cand_to_gold):
            if doc[i].text.isspace():
                output.append(None)
            elif gold_i is None:
                if i in i2j_multi:
                    output.append(gold_values[i2j_multi[i]])
                else:
                    output.append(None)
            else:
                output.append(gold_values[gold_i])
        return output

    def split_sents(self):
        """ Split the token annotations into multiple Examples based on
        sent_starts and return a list of the new Examples"""
        if not self.token_annotation.words:
            return [self]
        s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
        s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
        s_brackets = []
        sent_start_i = 0
        t = self.token_annotation
        split_examples = []
        for i in range(len(t.words)):
            if i > 0 and t.sent_starts[i] == 1:
                split_examples.append(
                    Example(
                        doc=Doc(self.doc.vocab, words=s_words),
                        token_annotation=TokenAnnotation(
                            ids=s_ids,
                            words=s_words,
                            tags=s_tags,
                            pos=s_pos,
                            morphs=s_morphs,
                            lemmas=s_lemmas,
                            heads=s_heads,
                            deps=s_deps,
                            entities=s_ents,
                            sent_starts=s_sent_starts,
                            brackets=s_brackets,
                        ),
                        doc_annotation=self.doc_annotation
                    )
                )
                s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                s_sent_starts, s_brackets = [], []
                sent_start_i = i
            s_ids.append(t.get_id(i))
            s_words.append(t.get_word(i))
            s_tags.append(t.get_tag(i))
            s_pos.append(t.get_pos(i))
            s_morphs.append(t.get_morph(i))
            s_lemmas.append(t.get_lemma(i))
            s_heads.append(t.get_head(i) - sent_start_i)
            s_deps.append(t.get_dep(i))
            s_ents.append(t.get_entity(i))
            s_sent_starts.append(t.get_sent_start(i))
            for b_end, b_label in t.brackets_by_start.get(i, []):
                s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
            i += 1
        split_examples.append(
            Example(
                doc=Doc(self.doc.vocab, words=s_words),
                token_annotation=TokenAnnotation(
                    ids=s_ids,
                    words=s_words,
                    tags=s_tags,
                    pos=s_pos,
                    morphs=s_morphs,
                    lemmas=s_lemmas,
                    heads=s_heads,
                    deps=s_deps,
                    entities=s_ents,
                    sent_starts=s_sent_starts,
                    brackets=s_brackets,
                ),
                doc_annotation=self.doc_annotation
            )
        )
        return split_examples

    @classmethod
    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
        """
        Return a list of Example objects, from a variety of input formats.
        make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
        """
        if isinstance(examples, Example):
            return [examples]
        if isinstance(examples, tuple):
            examples = [examples]
        converted_examples = []
        for ex in examples:
            if isinstance(ex, Example):
                converted_examples.append(ex)
            # convert string to Doc to Example
            elif isinstance(ex, str):
                if keep_raw_text:
                    converted_examples.append(Example(doc=ex))
                else:
                    doc = make_doc(ex)
                    converted_examples.append(Example(doc=doc))
            # convert tuples to Example
            elif isinstance(ex, tuple) and len(ex) == 2:
                doc, gold = ex
                # convert string to Doc
                if isinstance(doc, str) and not keep_raw_text:
                    doc = make_doc(doc)
                converted_examples.append(Example.from_dict(gold, doc=doc))
            # convert Doc to Example
            elif isinstance(ex, Doc):
                converted_examples.append(Example(doc=ex))
            else:
                converted_examples.append(ex)
        return converted_examples

    def _deprecated_get_gold(self, make_projective=False):
        from ..syntax.gold_parse import get_parses_from_example

        _, gold = get_parses_from_example(self, make_projective=make_projective)[0]
        return gold
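A rough usage sketch for NewExample.from_dict as defined above (hypothetical values; the class is explicitly marked as work-in-progress in this diff, so details may change): the predicted Doc carries the current tokenization, and the reference Doc is rebuilt from the annotation dict.

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold.new_example import NewExample

vocab = Vocab()
predicted = Doc(vocab, words=["London", "and", "Berlin"])
annots = {
    "words": ["London", "and", "Berlin"],
    "tags": ["NNP", "CC", "NNP"],
    "entities": [(0, 6, "LOC"), (11, 17, "LOC")],  # character offsets into "London and Berlin"
}
eg = NewExample.from_dict(predicted, annots)
# The legacy keys are remapped to ORTH/TAG/ENT_IOB/ENT_TYPE and written onto eg.reference:
print([t.tag_ for t in eg.reference])
print([(ent.text, ent.label_) for ent in eg.reference.ents])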
@@ -636,6 +636,7 @@ class Language(object):
        examples (iterable): `Example` objects.
        YIELDS (tuple): `Example` objects.
        """
+        # TODO: This is deprecated right?
        for name, proc in self.pipeline:
            if hasattr(proc, "preprocess_gold"):
                examples = proc.preprocess_gold(examples)
@@ -722,24 +723,26 @@ class Language(object):

        DOCS: https://spacy.io/api/language#evaluate
        """
-        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
+        examples = Example.to_example_objects(examples)
        if scorer is None:
            scorer = Scorer(pipeline=self.pipeline)
        if component_cfg is None:
            component_cfg = {}
+        docs = (eg.predicted for eg in examples)
        for name, pipe in self.pipeline:
            kwargs = component_cfg.get(name, {})
            kwargs.setdefault("batch_size", batch_size)
            if not hasattr(pipe, "pipe"):
-                examples = _pipe(examples, pipe, kwargs)
+                docs = _pipe(docs, pipe, kwargs)
            else:
-                examples = pipe.pipe(examples, as_example=True, **kwargs)
-        for ex in examples:
+                docs = pipe.pipe(docs, **kwargs)
+        for doc, eg in zip(docs, examples):
            if verbose:
                print(ex.doc)
+            eg.predicted = doc
            kwargs = component_cfg.get("scorer", {})
            kwargs.setdefault("verbose", verbose)
-            scorer.score(ex, **kwargs)
+            scorer.score(eg, **kwargs)
        return scorer

    @contextmanager
@@ -51,9 +51,9 @@ class Morphologizer(Tagger):
    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
                       **kwargs):
        for example in get_examples():
-            for i, morph in enumerate(example.token_annotation.morphs):
-                pos = example.token_annotation.get_pos(i)
-                morph = Morphology.feats_to_dict(morph)
+            for i, token in enumerate(example.reference):
+                pos = token.pos_
+                morph = token.morph
                norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
                if pos:
                    morph["POS"] = pos
@@ -92,7 +92,7 @@ class Morphologizer(Tagger):
        guesses = scores.argmax(axis=1)
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
        for ex in examples:
-            gold = ex.gold
+            gold = ex._deprecated_get_gold()
            for i in range(len(gold.morphs)):
                pos = gold.pos[i] if i < len(gold.pos) else ""
                morph = gold.morphs[i]
@@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
-from ..gold import Example
+from ..gold.new_example import NewExample as Example
from ..attrs import POS, ID
from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X
@@ -48,55 +48,38 @@ class Pipe(object):
    def from_nlp(cls, nlp, model, **cfg):
        return cls(nlp.vocab, model, **cfg)

-    def _get_doc(self, example):
-        """ Use this method if the `example` can be both a Doc or an Example """
-        if isinstance(example, Doc):
-            return example
-        return example.doc
-
    def __init__(self, vocab, model, **cfg):
        """Create a new pipe instance."""
        raise NotImplementedError

-    def __call__(self, example):
+    def __call__(self, Doc doc):
        """Apply the pipe to one document. The document is
        modified in-place, and returned.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
-        doc = self._get_doc(example)
        predictions = self.predict([doc])
        if isinstance(predictions, tuple) and len(predictions) == 2:
            scores, tensors = predictions
            self.set_annotations([doc], scores, tensors=tensors)
        else:
            self.set_annotations([doc], predictions)
-        if isinstance(example, Example):
-            example.doc = doc
-            return example
        return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.

        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
            predictions = self.predict(docs)
            if isinstance(predictions, tuple) and len(tuple) == 2:
                scores, tensors = predictions
                self.set_annotations(docs, scores, tensors=tensors)
            else:
                self.set_annotations(docs, predictions)

-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

    def predict(self, docs):
@@ -109,14 +92,13 @@ class Pipe(object):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

-    def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
+    def update(self, docs, set_annotations=False, drop=0.0, sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

        Delegates to predict() and get_loss().
        """
        if set_annotations:
-            docs = (self._get_doc(ex) for ex in examples)
            docs = list(self.pipe(docs))

    def rehearse(self, examples, sgd=None, losses=None, **config):
@@ -255,28 +237,15 @@ class Tagger(Pipe):
    def labels(self):
        return tuple(self.vocab.morphology.tag_names)

-    def __call__(self, example):
-        doc = self._get_doc(example)
+    def __call__(self, doc):
        tags = self.predict([doc])
        self.set_annotations([doc], tags)
-        if isinstance(example, Example):
-            example.doc = doc
-            return example
        return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
            tag_ids = self.predict(docs)
-            assert len(docs) == len(examples)
-            assert len(tag_ids) == len(examples)
            self.set_annotations(docs, tag_ids)

-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

    def predict(self, docs):
@@ -327,15 +296,17 @@ class Tagger(Pipe):
        doc.is_tagged = True

    def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.

-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return
        set_dropout_rate(self.model, drop)
-        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
+        tag_scores, bp_tag_scores = self.model.begin_update(
+            [eg.predicted for eg in examples])
        for sc in tag_scores:
            if self.model.ops.xp.isnan(sc.sum()):
                raise ValueError("nan value in scores")
@@ -347,17 +318,16 @@ class Tagger(Pipe):
        if losses is not None:
            losses[self.name] += loss
        if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, self._scores2guesses(tag_scores))

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        """Perform a 'rehearsal' update, where we try to match the output of
        an initial model.
        """
+        docs = [eg.predicted for eg in examples]
        if self._rehearsal_model is None:
            return
-        examples = Example.to_example_objects(examples)
-        docs = [ex.doc for ex in examples]
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return
@@ -373,7 +343,7 @@ class Tagger(Pipe):

    def get_loss(self, examples, scores):
        loss_func = SequenceCategoricalCrossentropy(names=self.labels)
-        truths = [eg.gold.tags for eg in examples]
+        truths = [eg.get_aligned("tag") for eg in examples]
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError("nan value when computing loss")
@@ -387,7 +357,8 @@ class Tagger(Pipe):
        orig_tag_map = dict(self.vocab.morphology.tag_map)
        new_tag_map = {}
        for example in get_examples():
-            for tag in example.token_annotation.tags:
+            for token in example.y:
+                tag = token.tag_
                if tag in orig_tag_map:
                    new_tag_map[tag] = orig_tag_map[tag]
                else:
@@ -560,9 +531,9 @@ class SentenceRecognizer(Tagger):
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for ex in examples:
-            gold = ex.gold
-            for sent_start in gold.sent_starts:
+        for eg in examples:
+            sent_starts = eg.get_aligned("sent_start")
+            for sent_start in sent_starts:
                if sent_start is None:
                    correct[idx] = guesses[idx]
                elif sent_start in tag_index:
@@ -575,7 +546,7 @@ class SentenceRecognizer(Tagger):
        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
        d_scores *= self.model.ops.asarray(known_labels)
        loss = (d_scores**2).sum()
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores

@@ -686,8 +657,8 @@ class MultitaskObjective(Tagger):
        gold_examples = nonproj.preprocess_training_data(get_examples())
        # for raw_text, doc_annot in gold_tuples:
        for example in gold_examples:
-            for i in range(len(example.token_annotation.ids)):
-                label = self.make_label(i, example.token_annotation)
+            for token in example.y:
+                label = self.make_label(token)
                if label is not None and label not in self.labels:
                    self.labels[label] = len(self.labels)
        self.model.initialize()
@@ -705,13 +676,13 @@ class MultitaskObjective(Tagger):
        cdef int idx = 0
        correct = numpy.zeros((scores.shape[0],), dtype="i")
        guesses = scores.argmax(axis=1)
-        golds = [ex.gold for ex in examples]
-        docs = [ex.doc for ex in examples]
-        for i, gold in enumerate(golds):
-            for j in range(len(docs[i])):
-                # Handels alignment for tokenization differences
-                token_annotation = gold.get_token_annotation()
-                label = self.make_label(j, token_annotation)
+        docs = [eg.predicted for eg in examples]
+        for i, eg in enumerate(examples):
+            # Handles alignment for tokenization differences
+            doc_annots = eg.get_aligned()
+            for j in range(len(eg.predicted)):
+                tok_annots = {key: values[j] for key, values in tok_annots.items()}
+                label = self.make_label(j, tok_annots)
                if label is None or label not in self.labels:
                    correct[idx] = guesses[idx]
                else:
@@ -723,83 +694,49 @@ class MultitaskObjective(Tagger):
        return float(loss), d_scores

    @staticmethod
-    def make_dep(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        return token_annotation.deps[i]
+    def make_dep(token):
+        return token.dep_

    @staticmethod
-    def make_tag(i, token_annotation):
-        return token_annotation.tags[i]
+    def make_tag(token):
+        return token.tag_

    @staticmethod
-    def make_ent(i, token_annotation):
-        if token_annotation.entities is None:
-            return None
-        return token_annotation.entities[i]
+    def make_ent(token):
+        if token.ent_iob_ == "O":
+            return "O"
+        else:
+            return token.ent_iob_ + "-" + token.ent_type_

    @staticmethod
-    def make_dep_tag_offset(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        offset = token_annotation.heads[i] - i
+    def make_dep_tag_offset(token):
+        dep = token.dep_
+        tag = token.tag_
+        offset = token.head.i - token.i
        offset = min(offset, 2)
        offset = max(offset, -2)
-        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
+        return f"{dep}-{tag}:{offset}"

    @staticmethod
-    def make_ent_tag(i, token_annotation):
-        if token_annotation.entities is None or token_annotation.entities[i] is None:
-            return None
+    def make_ent_tag(token):
+        if token.ent_iob_ == "O":
+            ent = "O"
        else:
-            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
+            ent = token.ent_iob_ + "-" + token.ent_type_
+        tag = token.tag_
+        return f"{tag}-{ent}"

    @staticmethod
-    def make_sent_start(target, token_annotation, cache=True, _cache={}):
+    def make_sent_start(token):
        """A multi-task objective for representing sentence boundaries,
        using BILU scheme. (O is impossible)
-
-        The implementation of this method uses an internal cache that relies
-        on the identity of the heads array, to avoid requiring a new piece
-        of gold data. You can pass cache=False if you know the cache will
-        do the wrong thing.
        """
-        words = token_annotation.words
-        heads = token_annotation.heads
-        assert len(words) == len(heads)
-        assert target < len(words), (target, len(words))
-        if cache:
-            if id(heads) in _cache:
-                return _cache[id(heads)][target]
-            else:
-                for key in list(_cache.keys()):
-                    _cache.pop(key)
-                sent_tags = ["I-SENT"] * len(words)
-                _cache[id(heads)] = sent_tags
-        else:
-            sent_tags = ["I-SENT"] * len(words)
-
-        def _find_root(child):
-            seen = set([child])
-            while child is not None and heads[child] != child:
-                seen.add(child)
-                child = heads[child]
-            return child
-
-        sentences = {}
-        for i in range(len(words)):
-            root = _find_root(i)
-            if root is None:
-                sent_tags[i] = None
-            else:
-                sentences.setdefault(root, []).append(i)
-        for root, span in sorted(sentences.items()):
-            if len(span) == 1:
-                sent_tags[span[0]] = "U-SENT"
-            else:
-                sent_tags[span[0]] = "B-SENT"
-                sent_tags[span[-1]] = "L-SENT"
-        return sent_tags[target]
+        if token.is_sent_start and token.is_sent_end:
+            return "U-SENT"
+        elif token.is_sent_start:
+            return "B-SENT"
+        else:
+            return "I-SENT"


class ClozeMultitask(Pipe):
@@ -832,7 +769,7 @@ class ClozeMultitask(Pipe):
        # token.vector values, but that's a bit inefficient, especially on GPU.
        # Instead we fetch the index into the vectors table for each of our tokens,
        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
+        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
        target = vectors[ids]
        gradient = self.distance.get_grad(prediction, target)
        loss = self.distance.get_loss(prediction, target)
@@ -842,11 +779,12 @@ class ClozeMultitask(Pipe):
        pass

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.
+        docs = [eg.predicted for eg in examples]
        set_dropout_rate(self.model, drop)
-        predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
+        predictions, bp_predictions = self.model.begin_update(
+            [eg.predicted for eg in examples])
        loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
        bp_predictions(d_predictions)
        if sgd is not None:
@@ -881,17 +819,10 @@ class TextCategorizer(Pipe):
    def labels(self, value):
        self.cfg["labels"] = tuple(value)

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
            scores, tensors = self.predict(docs)
            self.set_annotations(docs, scores, tensors=tensors)

-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

    def predict(self, docs):
@@ -913,12 +844,15 @@ class TextCategorizer(Pipe):
            doc.cats[label] = float(scores[i, j])

    def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        for eg in examples:
+            assert isinstance(eg, Example)
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
            # Handle cases where there are no tokens in any docs.
            return
        set_dropout_rate(self.model, drop)
-        scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
+        scores, bp_scores = self.model.begin_update(
+            [eg.predicted for eg in examples]
+        )
        loss, d_scores = self.get_loss(examples, scores)
        bp_scores(d_scores)
        if sgd is not None:
@@ -927,14 +861,15 @@ class TextCategorizer(Pipe):
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
            self.set_annotations(docs, scores=scores)

    def rehearse(self, examples, drop=0., sgd=None, losses=None):
        if self._rehearsal_model is None:
            return
-        examples = Example.to_example_objects(examples)
-        docs=[ex.doc for ex in examples]
+        for eg in examples:
+            assert isinstance(eg, Example)
+        docs = [eg.predicted for eg in examples]
        if not any(len(doc) for doc in docs):
            # Handle cases where there are no tokens in any docs.
            return
@@ -950,13 +885,12 @@ class TextCategorizer(Pipe):
            losses[self.name] += (gradient**2).sum()

    def _examples_to_truth(self, examples):
-        gold_cats = [ex.doc_annotation.cats for ex in examples]
-        truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f")
-        not_missing = numpy.ones((len(gold_cats), len(self.labels)), dtype="f")
-        for i, gold_cat in enumerate(gold_cats):
+        truths = numpy.zeros((len(examples), len(self.labels)), dtype="f")
+        not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
+        for i, eg in enumerate(examples):
            for j, label in enumerate(self.labels):
-                if label in gold_cat:
-                    truths[i, j] = gold_cat[label]
+                if label in eg.predicted.cats:
+                    truths[i, j] = eg.reference.cats[label]
                else:
                    not_missing[i, j] = 0.
        truths = self.model.ops.asarray(truths)
@@ -993,7 +927,7 @@ class TextCategorizer(Pipe):
        # TODO: begin_training is not guaranteed to see all data / labels ?
        examples = list(get_examples())
        for example in examples:
-            for cat in example.doc_annotation.cats:
+            for cat in example.y.cats:
                self.add_label(cat)
        self.require_labels()
        docs = [Doc(Vocab(), words=["hello"])]
@@ -1150,21 +1084,22 @@ class EntityLinker(Pipe):
            losses.setdefault(self.name, 0.0)
        if not examples:
            return 0
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
        sentence_docs = []
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
        if set_annotations:
            # This seems simpler than other ways to get that exact output -- but
            # it does run the model twice :(
            predictions = self.model.predict(docs)
-        golds = [ex.gold for ex in examples]

-        for doc, gold in zip(docs, golds):
+        for eg in examples:
+            doc = eg.predicted
            ents_by_offset = dict()
            for ent in doc.ents:
                ents_by_offset[(ent.start_char, ent.end_char)] = ent
+            links = self._get_links_from_doc(eg.reference)
-            for entity, kb_dict in gold.links.items():
+            for entity, kb_dict in links.items():
                if isinstance(entity, str):
                    entity = literal_eval(entity)
                start, end = entity
@@ -1185,7 +1120,10 @@ class EntityLinker(Pipe):
            raise RuntimeError(Errors.E030)
        set_dropout_rate(self.model, drop)
        sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
-        loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
+        loss, d_scores = self.get_similarity_loss(
+            scores=sentence_encodings,
+            examples=examples
+        )
        bp_context(d_scores)
        if sgd is not None:
            self.model.finish_update(sgd)
@@ -1196,10 +1134,11 @@ class EntityLinker(Pipe):
            self.set_annotations(docs, predictions)
        return loss

-    def get_similarity_loss(self, golds, scores):
+    def get_similarity_loss(self, examples, scores):
        entity_encodings = []
-        for gold in golds:
-            for entity, kb_dict in gold.links.items():
+        for eg in examples:
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                for kb_id, value in kb_dict.items():
                    # this loss function assumes we're only using positive examples
                    if value:
@@ -1218,8 +1157,9 @@ class EntityLinker(Pipe):

    def get_loss(self, examples, scores):
        cats = []
-        for ex in examples:
-            for entity, kb_dict in ex.gold.links.items():
+        for eg in examples:
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                for kb_id, value in kb_dict.items():
                    cats.append([value])

@@ -1232,26 +1172,18 @@ class EntityLinker(Pipe):
        loss = loss / len(cats)
        return loss, d_scores

-    def __call__(self, example):
-        doc = self._get_doc(example)
+    def _get_links_from_doc(self, doc):
+        return {}
+
+    def __call__(self, doc):
        kb_ids, tensors = self.predict([doc])
        self.set_annotations([doc], kb_ids, tensors=tensors)
-        if isinstance(example, Example):
-            example.doc = doc
-            return example
        return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
            kb_ids, tensors = self.predict(docs)
            self.set_annotations(docs, kb_ids, tensors=tensors)

-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

    def predict(self, docs):
@@ -1428,7 +1360,7 @@ class Sentencizer(Pipe):
    ):
        pass

-    def __call__(self, example):
+    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

        example (Doc or Example): The document to process.
@@ -1436,7 +1368,6 @@ class Sentencizer(Pipe):

        DOCS: https://spacy.io/api/sentencizer#call
        """
-        doc = self._get_doc(example)
        start = 0
        seen_period = False
        for i, token in enumerate(doc):
@@ -1450,25 +1381,16 @@ class Sentencizer(Pipe):
            seen_period = True
        if start < len(doc):
            doc[start].is_sent_start = True
-        if isinstance(example, Example):
-            example.doc = doc
-            return example
        return doc

-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
            predictions = self.predict(docs)
            if isinstance(predictions, tuple) and len(tuple) == 2:
                scores, tensors = predictions
                self.set_annotations(docs, scores, tensors=tensors)
            else:
                self.set_annotations(docs, predictions)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs

    def predict(self, docs):
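The net effect of the pipeline changes above is that annotation and training now use different types: __call__ and pipe() deal only in Doc objects, while update() receives Example objects and reads the tokenized input from eg.predicted. A hypothetical driver function sketching that calling convention (names are illustrative, not part of the diff):

def run_component(pipe, docs, examples, optimizer):
    """Illustrative only: exercise a pipeline component under the new API."""
    # Annotation: __call__ and pipe() take and yield Doc objects, with no
    # as_example flag.
    annotated = [pipe(doc) for doc in docs]
    streamed = list(pipe.pipe(docs, batch_size=32))
    # Training: update() expects Example objects; components read the current
    # tokenization from eg.predicted and gold values via eg.get_aligned(...)
    # or eg.reference.
    losses = {}
    pipe.update(examples, sgd=optimizer, losses=losses, set_annotations=False)
    return annotated, streamed, losses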
@@ -286,7 +286,7 @@ class Scorer(object):
        if isinstance(example, tuple) and len(example) == 2:
            doc, gold = example
        else:
-            gold = example.gold
+            gold = example._deprecated_get_gold()
            doc = example.doc

        if len(doc) != len(gold):
@@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
from .stateclass cimport StateClass
from ..typedefs cimport weight_t, attr_t
from .transition_system cimport TransitionSystem, Transition
-from ..gold cimport GoldParseC
+from .gold_parse cimport GoldParseC


cdef class ArcEager(TransitionSystem):
39  spacy/syntax/gold_parse.pxd  Normal file

@@ -0,0 +1,39 @@
from cymem.cymem cimport Pool
from .transition_system cimport Transition
from ..typedefs cimport attr_t


cdef struct GoldParseC:
    int* tags
    int* heads
    int* has_dep
    int* sent_start
    attr_t* labels
    int** brackets
    Transition* ner


cdef class GoldParse:
    cdef Pool mem

    cdef GoldParseC c
    cdef readonly object orig

    cdef int length
    cdef public int loss
    cdef public list words
    cdef public list tags
    cdef public list pos
    cdef public list morphs
    cdef public list lemmas
    cdef public list sent_starts
    cdef public list heads
    cdef public list labels
    cdef public dict orths
    cdef public list ner
    cdef public dict brackets
    cdef public dict cats
    cdef public dict links

    cdef readonly list cand_to_gold
    cdef readonly list gold_to_cand
spacy/syntax/gold_parse.pyx (new file, 346 lines)
@@ -0,0 +1,346 @@
# cython: profile=True
import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly
import warnings

from .. import util
from . import nonproj
from ..tokens import Doc, Span
from ..errors import Errors, AlignmentError, Warnings
from ..gold.annotation import TokenAnnotation
from ..gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets
from ..gold.align import align


punct_re = re.compile(r"\W")


def is_punct_label(label):
    return label == "P" or label.lower() == "punct"


def get_parses_from_example(
    example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
):
    """Return a list of (doc, GoldParse) objects.
    If merge is set to True, keep all Token annotations as one big list."""
    # merge == do not modify Example
    if merge:
        examples = [example]
    else:
        # not merging: one GoldParse per sentence, defining docs with the words
        # from each sentence
        examples = example.split_sents()
    outputs = []
    for eg in examples:
        eg_dict = eg.to_dict()
        try:
            gp = GoldParse.from_annotation(
                eg.predicted,
                eg_dict["doc_annotation"],
                eg_dict["token_annotation"],
                make_projective=make_projective
            )
        except AlignmentError:
            if ignore_misaligned:
                gp = None
            else:
                raise
        outputs.append((eg.predicted, gp))
    return outputs


cdef class GoldParse:
    """Collection for training annotations.

    DOCS: https://spacy.io/api/goldparse
    """
    @classmethod
    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
        return cls(
            doc,
            words=token_annotation["words"],
            tags=token_annotation["tags"],
            pos=token_annotation["pos"],
            morphs=token_annotation["morphs"],
            lemmas=token_annotation["lemmas"],
            heads=token_annotation["heads"],
            deps=token_annotation["deps"],
            entities=token_annotation["entities"],
            sent_starts=token_annotation["sent_starts"],
            cats=doc_annotation["cats"],
            links=doc_annotation["links"],
            make_projective=make_projective
        )

    def get_token_annotation(self):
        ids = None
        if self.words:
            ids = list(range(len(self.words)))

        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                               pos=self.pos, morphs=self.morphs,
                               lemmas=self.lemmas, heads=self.heads,
                               deps=self.labels, entities=self.ner,
                               sent_starts=self.sent_starts)

    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
                 lemmas=None, heads=None, deps=None, entities=None,
                 sent_starts=None, make_projective=False, cats=None,
                 links=None):
        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        pos (iterable): A sequence of strings, representing UPOS annotations.
        morphs (iterable): A sequence of strings, representing morph
            annotations.
        lemmas (iterable): A sequence of strings, representing lemma
            annotations.
        heads (iterable): A sequence of integers, representing syntactic
            head offsets.
        deps (iterable): A sequence of strings, representing the syntactic
            relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        sent_starts (iterable): A sequence of sentence position tags, 1 for
            the first word in a sentence, 0 for all others.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        links (dict): A dict with `(start_char, end_char)` keys,
            and the values being dicts with kb_id:value entries,
            representing the external IDs in a knowledge base (KB)
            mapped to either 1.0 or 0.0, indicating positive and
            negative examples respectively.
        RETURNS (GoldParse): The newly constructed object.
        """
        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)

        self.cats = {} if cats is None else dict(cats)
        self.links = {} if links is None else dict(links)

        # temporary doc for aligning entity annotation
        entdoc = None

        # avoid allocating memory if the doc does not contain any tokens
        if self.length == 0:
            self.words = []
            self.tags = []
            self.heads = []
            self.labels = []
            self.ner = []
            self.morphs = []
            # set a minimal orig so that the scorer can score an empty doc
            self.orig = TokenAnnotation(ids=[])
        else:
            if not words:
                words = [token.text for token in doc]
            if not tags:
                tags = [None for _ in words]
            if not pos:
                pos = [None for _ in words]
            if not morphs:
                morphs = [None for _ in words]
            if not lemmas:
                lemmas = [None for _ in words]
            if not heads:
                heads = [None for _ in words]
            if not deps:
                deps = [None for _ in words]
            if not sent_starts:
                sent_starts = [None for _ in words]
            if entities is None:
                entities = ["-" for _ in words]
            elif len(entities) == 0:
                entities = ["O" for _ in words]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
                entities = [(ent if ent is not None else "-") for ent in entities]
                if not isinstance(entities[0], str):
                    # Assume we have entities specified by character offset.
                    # Create a temporary Doc corresponding to provided words
                    # (to preserve gold tokenization) and text (to preserve
                    # character offsets).
                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
                    entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
                    # There may be some additional whitespace tokens in the
                    # temporary doc, so check that the annotations align with
                    # the provided words while building a list of BILUO labels.
                    entities = []
                    words_offset = 0
                    for i in range(len(entdoc_words)):
                        if words[i + words_offset] == entdoc_words[i]:
                            entities.append(entdoc_entities[i])
                        else:
                            words_offset -= 1
                    if len(entities) != len(words):
                        warnings.warn(Warnings.W029.format(text=doc.text))
                        entities = ["-" for _ in words]

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

            self.words = [None] * len(doc)
            self.tags = [None] * len(doc)
            self.pos = [None] * len(doc)
            self.morphs = [None] * len(doc)
            self.lemmas = [None] * len(doc)
            self.heads = [None] * len(doc)
            self.labels = [None] * len(doc)
            self.ner = [None] * len(doc)
            self.sent_starts = [None] * len(doc)

            # This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
                heads, deps = nonproj.projectivize(heads, deps)

            # Do many-to-one alignment for misaligned tokens.
            # If we over-segment, we'll have one gold word that covers a sequence
            # of predicted words
            # If we under-segment, we'll have one predicted word that covers a
            # sequence of gold words.
            # If we "mis-segment", we'll have a sequence of predicted words covering
            # a sequence of gold words. That's many-to-many -- we don't do that
            # except for NER spans where the start and end can be aligned.
            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

            self.orig = TokenAnnotation(ids=list(range(len(words))),
                    words=words, tags=tags, pos=pos, morphs=morphs,
                    lemmas=lemmas, heads=heads, deps=deps, entities=entities,
                    sent_starts=sent_starts, brackets=[])

            for i, gold_i in enumerate(self.cand_to_gold):
                if doc[i].text.isspace():
                    self.words[i] = doc[i].text
                    self.tags[i] = "_SP"
                    self.pos[i] = "SPACE"
                    self.morphs[i] = None
                    self.lemmas[i] = None
                    self.heads[i] = None
                    self.labels[i] = None
                    self.ner[i] = None
                    self.sent_starts[i] = 0
                if gold_i is None:
                    if i in i2j_multi:
                        self.words[i] = words[i2j_multi[i]]
                        self.tags[i] = tags[i2j_multi[i]]
                        self.pos[i] = pos[i2j_multi[i]]
                        self.morphs[i] = morphs[i2j_multi[i]]
                        self.lemmas[i] = lemmas[i2j_multi[i]]
                        self.sent_starts[i] = sent_starts[i2j_multi[i]]
                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
                        # Set next word in multi-token span as head, until last
                        if not is_last:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
                            head_i = heads[i2j_multi[i]]
                            if head_i:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        ner_tag = entities[i2j_multi[i]]
                        # Assign O/- for many-to-one O/- NER tags
                        if ner_tag in ("O", "-"):
                            self.ner[i] = ner_tag
                else:
                    self.words[i] = words[gold_i]
                    self.tags[i] = tags[gold_i]
                    self.pos[i] = pos[gold_i]
                    self.morphs[i] = morphs[gold_i]
                    self.lemmas[i] = lemmas[gold_i]
                    self.sent_starts[i] = sent_starts[gold_i]
                    if heads[gold_i] is None:
                        self.heads[i] = None
                    else:
                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
                    self.labels[i] = deps[gold_i]
                    self.ner[i] = entities[gold_i]
            # Assign O/- for one-to-many O/- NER tags
            for j, cand_j in enumerate(self.gold_to_cand):
                if cand_j is None:
                    if j in j2i_multi:
                        i = j2i_multi[j]
                        ner_tag = entities[j]
                        if ner_tag in ("O", "-"):
                            self.ner[i] = ner_tag

            # If there is entity annotation and some tokens remain unaligned,
            # align all entities at the character level to account for all
            # possible token misalignments within the entity spans
            if any([e not in ("O", "-") for e in entities]) and None in self.ner:
                # If the temporary entdoc wasn't created above, initialize it
                if not entdoc:
                    entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
                    entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
                # Get offsets based on gold words and BILUO entities
                entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
                aligned_offsets = []
                aligned_spans = []
                # Filter offsets to identify those that align with doc tokens
                for offset in entdoc_offsets:
                    span = doc.char_span(offset[0], offset[1])
                    if span and not span.text.isspace():
                        aligned_offsets.append(offset)
                        aligned_spans.append(span)
                # Convert back to BILUO for doc tokens and assign NER for all
                # aligned spans
                biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
                for span in aligned_spans:
                    for i in range(span.start, span.end):
                        self.ner[i] = biluo_tags[i]

            # Prevent whitespace that isn't within entities from being tagged as
            # an entity.
            for i in range(len(self.ner)):
                if self.tags[i] == "_SP":
                    prev_ner = self.ner[i-1] if i >= 1 else None
                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
                    if prev_ner == "O" or next_ner == "O":
                        self.ner[i] = "O"

            cycle = nonproj.contains_cycle(self.heads)
            if cycle is not None:
                raise ValueError(Errors.E069.format(cycle=cycle,
                    cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
                    doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective
        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)
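Note on the new module above: GoldParse now lives in spacy.syntax and is built either directly or via get_parses_from_example(). The sketch below only exercises the constructor exactly as defined in the file above; it is an illustration, not part of the commit.

# Illustrative sketch, not part of this commit: build a GoldParse with the
# constructor added in spacy/syntax/gold_parse.pyx and inspect its fields.
from spacy.lang.en import English
from spacy.syntax.gold_parse import GoldParse

nlp = English()
doc = nlp.make_doc("I like London")
gold = GoldParse(
    doc,
    words=["I", "like", "London"],
    heads=[1, 1, 1],
    deps=["nsubj", "ROOT", "dobj"],
    entities=["O", "O", "U-GPE"],
)
print(len(gold))            # number of gold-standard tokens
print(gold.is_projective)   # True for this simple tree
print(gold.cand_to_gold)    # token alignment, here the identity mapping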
@@ -515,8 +515,8 @@ cdef class Parser:
         good_golds = []
         good_states = []
         for i, eg in enumerate(whole_examples):
-            doc = eg.doc
-            gold = self.moves.preprocess_gold(eg.gold)
+            parses = get_parses_from_example(eg)
+            doc, gold = parses[0]
             if gold is not None and self.moves.has_gold(gold):
                 good_docs.append(doc)
                 good_golds.append(gold)

@@ -535,8 +535,12 @@ cdef class Parser:
         cdef:
             StateClass state
             Transition action
-        whole_docs = [ex.doc for ex in whole_examples]
-        whole_golds = [ex.gold for ex in whole_examples]
+        whole_docs = []
+        whole_golds = []
+        for eg in whole_examples:
+            for doc, gold in get_parses_from_example(eg):
+                whole_docs.append(doc)
+                whole_golds.append(gold)
         whole_states = self.moves.init_batch(whole_docs)
         max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
         max_moves = 0

@@ -625,7 +629,7 @@ cdef class Parser:
         doc_sample = []
         gold_sample = []
         for example in islice(get_examples(), 10):
-            parses = example.get_gold_parses(merge=False, vocab=self.vocab)
+            parses = get_parses_from_example(example, merge=False, vocab=self.vocab)
             for doc, gold in parses:
                 if len(doc):
                     doc_sample.append(doc)
@@ -7,7 +7,7 @@ from copy import copy

 from ..tokens.doc cimport Doc, set_children_from_heads

-from ..gold import Example
+from ..gold import Example, TokenAnnotation
 from ..errors import Errors


@@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
             proj_token_dict = example.token_annotation.to_dict()
             proj_token_dict["heads"] = proj_heads
             proj_token_dict["deps"] = deco_deps
-            new_example.set_token_annotation(**proj_token_dict)
+            new_example.token_annotation = TokenAnnotation(**proj_token_dict)
             preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)

@@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs):
                 filtered_labels.append(label)
         filtered_token_dict = example.token_annotation.to_dict()
         filtered_token_dict["deps"] = filtered_labels
-        new_example.set_token_annotation(**filtered_token_dict)
+        new_example.token_annotation = TokenAnnotation(**filtered_token_dict)
         filtered.append(new_example)
     return filtered
@@ -35,7 +35,10 @@ def _train_parser(parser):
     for i in range(5):
         losses = {}
         doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
-        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
+        gold = {
+            "heads": [1, 1, 3, 3],
+            "deps": ["left", "ROOT", "left", "ROOT"]
+        }
         parser.update((doc, gold), sgd=sgd, losses=losses)
     return parser

@@ -47,9 +50,10 @@ def test_add_label(parser):
     for i in range(100):
         losses = {}
         doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
-        gold = GoldParse(
-            doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
-        )
+        gold = {
+            "heads": [1, 1, 3, 3],
+            "deps": ["right", "ROOT", "left", "ROOT"]
+        }
         parser.update((doc, gold), sgd=sgd, losses=losses)
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     doc = parser(doc)

@@ -47,7 +47,7 @@ def doc(vocab):

 @pytest.fixture
 def gold(doc):
-    return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"])
+    return {"heads": [1, 1, 1], "deps": ["L", "ROOT", "R"]}


 def test_can_init_nn_parser(parser):
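The pattern in the test changes above, where a plain dict of annotations replaces a GoldParse object, is the shape the update call is moving toward. A minimal sketch of that call; the parser and optimizer are assumed to be set up as in the test fixtures, so this is illustration only.

# Illustrative sketch, not part of this commit: update a parser from a
# (Doc, dict) pair instead of a GoldParse. `parser` and `sgd` are assumed
# to exist, as in the fixtures above.
from spacy.tokens import Doc

doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
losses = {}
parser.update((doc, gold), sgd=sgd, losses=losses)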
@@ -1,7 +1,6 @@
 import pytest
 from thinc.api import Adam
 from spacy.attrs import NORM
-from spacy.gold import GoldParse
 from spacy.vocab import Vocab

 from spacy.pipeline.defaults import default_parser

@@ -28,7 +27,7 @@ def parser(vocab):
     for i in range(10):
         losses = {}
         doc = Doc(vocab, words=["a", "b", "c", "d"])
-        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
+        gold = dict(heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
         parser.update((doc, gold), sgd=sgd, losses=losses)
     return parser
@@ -3,7 +3,7 @@ import gc
 import numpy
 import copy

-from spacy.gold import Example
+from spacy.gold import Example, TokenAnnotation
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop

@@ -272,9 +272,16 @@ def test_issue1963(en_tokenizer):
 def test_issue1967(label):
     config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
     ner = EntityRecognizer(Vocab(), default_ner(), **config)
-    example = Example(doc=None)
-    example.set_token_annotation(
-        ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
+    example = Example(
+        doc=Doc(ner.vocab, words=["word"]),
+        token_annotation=TokenAnnotation(
+            ids=[0],
+            words=["word"],
+            tags=["tag"],
+            heads=[0],
+            deps=["dep"],
+            entities=[label]
+        )
     )
     ner.moves.get_actions(gold_parses=[example])
@@ -1,9 +1,12 @@
 from spacy.errors import AlignmentError
 from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
-from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
+from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
+from spacy.gold import GoldCorpus, docs_to_json, DocAnnotation
+from spacy.gold.new_example import NewExample as Example
 from spacy.lang.en import English
 from spacy.syntax.nonproj import is_nonproj_tree
+from spacy.syntax.gold_parse import GoldParse, get_parses_from_example
+from spacy.syntax.gold_parse import get_parses_from_example
 from spacy.tokens import Doc
 from spacy.util import get_words_and_spaces, compounding, minibatch
 import pytest

@@ -90,10 +93,16 @@ def merged_dict():
         "ids": [1, 2, 3, 4, 5, 6, 7],
         "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
         "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
-        "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
+        "sent_starts": [1, 0, 0, 1, 0, 0, 0],
     }


+@pytest.fixture
+def vocab():
+    nlp = English()
+    return nlp.vocab
+
+
 def test_gold_biluo_U(en_vocab):
     words = ["I", "flew", "to", "London", "."]
     spaces = [True, True, True, False, True]
@@ -270,88 +279,38 @@ def test_roundtrip_docs_to_json(doc):
         srsly.write_json(json_file, [docs_to_json(doc)])
         goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))

-        reloaded_example = next(goldcorpus.dev_dataset(nlp))
-        goldparse = reloaded_example.gold
+        reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))

         assert len(doc) == goldcorpus.count_train()
-        assert text == reloaded_example.text
-        assert tags == goldparse.tags
-        assert pos == goldparse.pos
-        assert morphs == goldparse.morphs
-        assert lemmas == goldparse.lemmas
-        assert deps == goldparse.labels
-        assert heads == goldparse.heads
-        assert biluo_tags == goldparse.ner
-        assert "TRAVEL" in goldparse.cats
-        assert "BAKING" in goldparse.cats
-        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
-        assert cats["BAKING"] == goldparse.cats["BAKING"]
-
-    # roundtrip to JSONL train dicts
-    with make_tempdir() as tmpdir:
-        jsonl_file = tmpdir / "roundtrip.jsonl"
-        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
-
-        reloaded_example = next(goldcorpus.dev_dataset(nlp))
-        goldparse = reloaded_example.gold
-
-        assert len(doc) == goldcorpus.count_train()
-        assert text == reloaded_example.text
-        assert tags == goldparse.tags
-        assert pos == goldparse.pos
-        assert morphs == goldparse.morphs
-        assert lemmas == goldparse.lemmas
-        assert deps == goldparse.labels
-        assert heads == goldparse.heads
-        assert biluo_tags == goldparse.ner
-        assert "TRAVEL" in goldparse.cats
-        assert "BAKING" in goldparse.cats
-        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
-        assert cats["BAKING"] == goldparse.cats["BAKING"]
-
-    # roundtrip to JSONL tuples
-    with make_tempdir() as tmpdir:
-        jsonl_file = tmpdir / "roundtrip.jsonl"
-        # write to JSONL train dicts
-        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
-        # load and rewrite as JSONL tuples
-        srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
-
-        reloaded_example = next(goldcorpus.dev_dataset(nlp))
-        goldparse = reloaded_example.gold
-
-        assert len(doc) == goldcorpus.count_train()
-        assert text == reloaded_example.text
-        assert tags == goldparse.tags
-        assert deps == goldparse.labels
-        assert heads == goldparse.heads
-        assert lemmas == goldparse.lemmas
-        assert biluo_tags == goldparse.ner
-        assert "TRAVEL" in goldparse.cats
-        assert "BAKING" in goldparse.cats
-        assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
-        assert cats["BAKING"] == goldparse.cats["BAKING"]
+        assert text == reloaded_example.predicted.text
+        assert tags == [t.tag_ for t in reloaded_example.reference]
+        assert pos == [t.pos_ for t in reloaded_example.reference]
+        assert morphs == [t.morph_ for t in reloaded_example.reference]
+        assert lemmas == [t.lemma_ for t in reloaded_example.reference]
+        assert deps == [t.dep_ for t in reloaded_example.reference]
+        assert heads == [t.head.i for t in reloaded_example.reference]
+        assert "TRAVEL" in reloaded_example.reference.cats
+        assert "BAKING" in reloaded_example.reference.cats
+        assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
+        assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]


+@pytest.mark.xfail  # TODO do we need to do the projectivity differently?
 def test_projective_train_vs_nonprojective_dev(doc):
     nlp = English()
     deps = [t.dep_ for t in doc]
     heads = [t.head.i for t in doc]

     with make_tempdir() as tmpdir:
-        jsonl_file = tmpdir / "test.jsonl"
-        # write to JSONL train dicts
-        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+        json_file = tmpdir / "test.json"
+        # write to JSON train dicts
+        srsly.write_json(json_file, [docs_to_json(doc)])
+        goldcorpus = GoldCorpus(str(json_file), str(json_file))

         train_reloaded_example = next(goldcorpus.train_dataset(nlp))
-        train_goldparse = train_reloaded_example.gold
+        train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]

         dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
-        dev_goldparse = dev_reloaded_example.gold
+        dev_goldparse = get_parses_from_example(dev_reloaded_example)[0][1]

         assert is_nonproj_tree([t.head.i for t in doc]) is True
         assert is_nonproj_tree(train_goldparse.heads) is False
@@ -364,27 +323,31 @@ def test_projective_train_vs_nonprojective_dev(doc):
         assert deps == dev_goldparse.labels


+# Hm, not sure where misalignment check would be handled? In the components too?
+# I guess that does make sense. A text categorizer doesn't care if it's
+# misaligned...
+@pytest.mark.xfail  # TODO
 def test_ignore_misaligned(doc):
     nlp = English()
     text = doc.text
     with make_tempdir() as tmpdir:
-        jsonl_file = tmpdir / "test.jsonl"
+        json_file = tmpdir / "test.json"
         data = [docs_to_json(doc)]
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSONL train dicts
-        srsly.write_jsonl(jsonl_file, data)
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+        # write to JSON train dicts
+        srsly.write_json(json_file, data)
+        goldcorpus = GoldCorpus(str(json_file), str(json_file))

         with pytest.raises(AlignmentError):
             train_reloaded_example = next(goldcorpus.train_dataset(nlp))

     with make_tempdir() as tmpdir:
-        jsonl_file = tmpdir / "test.jsonl"
+        json_file = tmpdir / "test.json"
         data = [docs_to_json(doc)]
         data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
-        # write to JSONL train dicts
-        srsly.write_jsonl(jsonl_file, data)
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+        # write to JSON train dicts
+        srsly.write_json(json_file, data)
+        goldcorpus = GoldCorpus(str(json_file), str(json_file))

         # doesn't raise an AlignmentError, but there is nothing to iterate over
         # because the only example can't be aligned
@@ -395,14 +358,14 @@ def test_ignore_misaligned(doc):
 def test_make_orth_variants(doc):
     nlp = English()
     with make_tempdir() as tmpdir:
-        jsonl_file = tmpdir / "test.jsonl"
-        # write to JSONL train dicts
-        srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+        json_file = tmpdir / "test.json"
+        # write to JSON train dicts
+        srsly.write_json(json_file, [docs_to_json(doc)])
+        goldcorpus = GoldCorpus(str(json_file), str(json_file))

         # due to randomness, test only that this runs with no errors for now
         train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
-        train_goldparse = train_reloaded_example.gold  # noqa: F841
+        train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]


 @pytest.mark.parametrize(
@@ -456,20 +419,6 @@ def test_gold_constructor():
     assert gold.words == ["This", "is", "a", "sentence"]


-def test_gold_orig_annot():
-    nlp = English()
-    doc = nlp("This is a sentence")
-    gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
-
-    assert gold.orig.words == ["This", "is", "a", "sentence"]
-    assert gold.cats["cat1"]
-
-    doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0})
-    gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig)
-    assert gold2.orig.words == ["This", "is", "a", "sentence"]
-    assert not gold2.cats["cat1"]
-
-
 def test_tuple_format_implicit():
     """Test tuple format with implicit GoldParse creation"""
@@ -485,6 +434,7 @@ def test_tuple_format_implicit():
     _train(train_data)


+@pytest.mark.xfail  # TODO
 def test_tuple_format_implicit_invalid():
     """Test that an error is thrown for an implicit invalid GoldParse field"""
@@ -518,43 +468,51 @@ def _train(train_data):

 def test_split_sents(merged_dict):
     nlp = English()
-    example = Example()
-    example.set_token_annotation(**merged_dict)
-    assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
-    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1
+    example = Example.from_dict(
+        Doc(nlp.vocab, words=merged_dict["words"]),
+        merged_dict
+    )
+    assert len(get_parses_from_example(
+        example,
+        merge=False,
+        vocab=nlp.vocab,
+        make_projective=False)
+    ) == 2
+    assert len(get_parses_from_example(
+        example,
+        merge=True,
+        vocab=nlp.vocab,
+        make_projective=False
+    )) == 1

     split_examples = example.split_sents()
     assert len(split_examples) == 2

-    token_annotation_1 = split_examples[0].token_annotation
-    assert token_annotation_1.ids == [1, 2, 3]
-    assert token_annotation_1.words == ["Hi", "there", "everyone"]
-    assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
-    assert token_annotation_1.sent_starts == [1, 0, 0]
+    token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
+    assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
+    assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"]
+    assert token_annotation_1["sent_starts"] == [1, 0, 0]

-    token_annotation_2 = split_examples[1].token_annotation
-    assert token_annotation_2.ids == [4, 5, 6, 7]
-    assert token_annotation_2.words == ["It", "is", "just", "me"]
-    assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
-    assert token_annotation_2.sent_starts == [1, 0, 0, 0]
+    token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
+    assert token_annotation_2["words"] == ["It", "is", "just", "me"]
+    assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
+    assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]


-def test_tuples_to_example(merged_dict):
-    ex = Example()
-    ex.set_token_annotation(**merged_dict)
-    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
-    ex.set_doc_annotation(cats=cats)
-    ex_dict = ex.to_dict()
-
-    assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
-    assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
-    assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
-    assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
-    assert ex_dict["doc_annotation"]["cats"] == cats
-
-
-def test_empty_example_goldparse():
-    nlp = English()
-    doc = nlp("")
-    example = Example(doc=doc)
-    assert len(example.get_gold_parses()) == 1
+# This fails on some None value? Need to look into that.
+@pytest.mark.xfail  # TODO
+def test_tuples_to_example(vocab, merged_dict):
+    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
+    merged_dict = dict(merged_dict)
+    merged_dict["cats"] = cats
+    ex = Example.from_dict(
+        Doc(vocab, words=merged_dict["words"]),
+        merged_dict
+    )
+    words = [token.text for token in ex.reference]
+    assert words == merged_dict["words"]
+    tags = [token.tag_ for token in ex.reference]
+    assert tags == merged_dict["tags"]
+    sent_starts = [token.is_sent_start for token in ex.reference]
+    assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]]
+    ex.reference.cats == cats
@@ -19,22 +19,16 @@ def nlp():
     return nlp


+@pytest.mark.xfail  # TODO
 def test_language_update(nlp):
     text = "hello world"
     annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
     wrongkeyannots = {"LABEL": True}
     doc = Doc(nlp.vocab, words=text.split(" "))
-    gold = GoldParse(doc, **annots)
-    # Update with doc and gold objects
-    nlp.update((doc, gold))
     # Update with text and dict
     nlp.update((text, annots))
     # Update with doc object and dict
     nlp.update((doc, annots))
-    # Update with text and gold object
-    nlp.update((text, gold))
-    # Update with empty doc and gold object
-    nlp.update((None, gold))
     # Update badly
     with pytest.raises(ValueError):
         nlp.update((doc, None))
@@ -44,20 +38,16 @@ def test_language_update(nlp):

 def test_language_evaluate(nlp):
     text = "hello world"
-    annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
+    annots = {
+        "doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
+    }
     doc = Doc(nlp.vocab, words=text.split(" "))
-    gold = GoldParse(doc, **annots)
-    # Evaluate with doc and gold objects
-    nlp.evaluate([(doc, gold)])
     # Evaluate with text and dict
     nlp.evaluate([(text, annots)])
     # Evaluate with doc object and dict
     nlp.evaluate([(doc, annots)])
-    # Evaluate with text and gold object
-    nlp.evaluate([(text, gold)])
-    # Evaluate badly
     with pytest.raises(Exception):
-        nlp.evaluate([text, gold])
+        nlp.evaluate([text, annots])


 def test_evaluate_no_pipe(nlp):
spacy/tests/test_new_example.py (new file, 186 lines)
@@ -0,0 +1,186 @@
import pytest
from spacy.gold.new_example import NewExample as Example
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_Example_init_requires_doc_objects():
    vocab = Vocab()
    with pytest.raises(TypeError):
        eg = Example(None, None)
    with pytest.raises(TypeError):
        eg = Example(Doc(vocab, words=["hi"]), None)
    with pytest.raises(TypeError):
        eg = Example(None, Doc(vocab, words=["hi"]))


def test_Example_from_dict_basic():
    eg = Example.from_dict(
        Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]}
    )
    assert isinstance(eg.x, Doc)
    assert isinstance(eg.y, Doc)


@pytest.mark.parametrize(
    "annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}]
)
def test_Example_from_dict_invalid(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)


@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
def test_Example_from_dict_with_tags(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    for i, token in enumerate(eg.reference):
        assert token.tag_ == annots["tags"][i]


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "London", "and", "Berlin", "."],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
            "heads": [1, 1, 1, 2, 2, 1],
        }
    ],
)
def test_Example_from_dict_with_parse(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    for i, token in enumerate(eg.reference):
        assert token.dep_ == annots["deps"][i]
        assert token.head.i == annots["heads"][i]


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["Sarah", "'s", "sister", "flew"],
            "morphs": [
                "NounType=prop|Number=sing",
                "Poss=yes",
                "Number=sing",
                "Tense=past|VerbForm=fin",
            ],
        }
    ],
)
def test_Example_from_dict_with_morphology(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    for i, token in enumerate(eg.reference):
        assert token.morph_ == annots["morphs"][i]


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "one", "sentence", "this", "is", "another"],
            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
        }
    ],
)
def test_Example_from_dict_with_sent_start(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    assert len(list(eg.reference.sents)) == 2
    for i, token in enumerate(eg.reference):
        assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["This", "is", "a", "sentence"],
            "cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
        }
    ],
)
def test_Example_from_dict_with_cats(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    assert len(list(eg.reference.cats)) == 3
    assert eg.reference.cats["cat1"] == 1.0
    assert eg.reference.cats["cat2"] == 0.0
    assert eg.reference.cats["cat3"] == 0.5


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
        }
    ],
)
def test_Example_from_dict_with_entities(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    assert len(list(eg.reference.ents)) == 2
    assert eg.reference[0].ent_iob_ == "O"
    assert eg.reference[1].ent_iob_ == "O"
    assert eg.reference[2].ent_iob_ == "B"
    assert eg.reference[3].ent_iob_ == "I"
    assert eg.reference[4].ent_iob_ == "O"
    assert eg.reference[5].ent_iob_ == "B"
    assert eg.reference[6].ent_iob_ == "O"
    assert eg.reference[2].ent_type_ == "LOC"
    assert eg.reference[3].ent_type_ == "LOC"
    assert eg.reference[5].ent_type_ == "LOC"


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},
        }
    ],
)
def test_Example_from_dict_with_links(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    eg = Example.from_dict(predicted, annots)
    assert eg.reference[0].ent_kb_id_ == ""
    assert eg.reference[1].ent_kb_id_ == ""
    assert eg.reference[2].ent_kb_id_ == "Q60"
    assert eg.reference[3].ent_kb_id_ == "Q60"
    assert eg.reference[4].ent_kb_id_ == ""
    assert eg.reference[5].ent_kb_id_ == "Q64"
    assert eg.reference[6].ent_kb_id_ == ""


@pytest.mark.parametrize(
    "annots",
    [
        {
            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
            "entities": [(7, 15, "LOC"), (20, 26, "LOC")],
            "links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
        }
    ],
)
def test_Example_from_dict_with_links_invalid(annots):
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
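A compact illustration of the new Example surface exercised by the test file above. Everything here follows directly from those tests (from_dict, x/y, predicted/reference); it is an illustration of the tested behaviour, not separately documented API.

# Illustrative sketch, not part of this commit: build a NewExample from a
# predicted Doc plus a dict of gold annotations, then read the reference side.
from spacy.gold.new_example import NewExample as Example
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
predicted = Doc(vocab, words=["I", "like", "London"])
eg = Example.from_dict(predicted, {"words": ["I", "like", "London"], "tags": ["PRP", "VBP", "NNP"]})

print(type(eg.x), type(eg.y))            # both sides are Doc objects
print([t.tag_ for t in eg.reference])    # gold tags: ['PRP', 'VBP', 'NNP']
print(eg.predicted.text)                 # the text the model will see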
@@ -1,12 +1,14 @@
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import Example, GoldParse
+from spacy.gold import Example, GoldParse, TokenAnnotation
+from spacy.gold.iob_utils import biluo_tags_from_offsets
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
 from spacy.lang.en import English


 test_las_apple = [
     [
         "Apple is looking at buying U.K. startup for $ 1 billion",

@@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
         scorer.score(ex)
     results = scorer.scores

@@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
         )
-        ex = Example(doc=doc)
-        ex.set_token_annotation(entities=annot["entities"])
+        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        ex = Example(
+            doc=doc,
+            token_annotation=TokenAnnotation(entities=entities)
+        )
        scorer.score(ex)
     results = scorer.scores

@@ -799,6 +799,8 @@ cdef class Doc:
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
+        if length != len(self):
+            raise ValueError("Cannot set array values longer than the document.")
         # Get set up for fast loading
         cdef Pool mem = Pool()
         cdef int n_attrs = len(attrs)

@@ -823,6 +825,13 @@ cdef class Doc:
             for i in range(length):
                 if array[i, col] != 0:
                     self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
+        # Verify ENT_IOB are proper integers
+        if ENT_IOB in attrs:
+            iob_strings = Token.iob_strings()
+            col = attrs.index(ENT_IOB)
+            for i in range(length):
+                if array[i, col] not in range(0, len(iob_strings)):
+                    raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
         # Now load the data
         for i in range(length):
             token = &self.c[i]

@@ -881,6 +890,32 @@ cdef class Doc:
     def to_bytes(self, exclude=tuple(), **kwargs):
         """Serialize, i.e. export the document contents to a binary string.

+        exclude (list): String names of serialization fields to exclude.
+        RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
+            all annotations.
+
+        DOCS: https://spacy.io/api/doc#to_bytes
+        """
+        return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs))
+
+    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+        """Deserialize, i.e. import the document contents from a binary string.
+
+        data (bytes): The string to load from.
+        exclude (list): String names of serialization fields to exclude.
+        RETURNS (Doc): Itself.
+
+        DOCS: https://spacy.io/api/doc#from_bytes
+        """
+        return self.from_dict(
+            srsly.msgpack_loads(bytes_data),
+            exclude=exclude,
+            **kwargs
+        )
+
+    def to_dict(self, exclude=tuple(), **kwargs):
+        """Export the document contents to a dictionary for serialization.
+
         exclude (list): String names of serialization fields to exclude.
         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
             all annotations.

@@ -917,9 +952,9 @@ cdef class Doc:
             serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
         if "user_data_values" not in exclude:
             serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
-        return util.to_bytes(serializers, exclude)
+        return util.to_dict(serializers, exclude)

-    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+    def from_dict(self, msg, exclude=tuple(), **kwargs):
         """Deserialize, i.e. import the document contents from a binary string.

         data (bytes): The string to load from.

@@ -943,7 +978,6 @@ cdef class Doc:
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
                 raise ValueError(Errors.E128.format(arg=key))
-        msg = util.from_bytes(bytes_data, deserializers, exclude)
         # Msgpack doesn't distinguish between lists and tuples, which is
         # vexing for user data. As a best guess, we *know* that within
         # keys, we must have tuples. In values we just have to hope

@@ -975,6 +1009,7 @@ cdef class Doc:
         self.from_array(msg["array_head"][2:], attrs[:, 2:])
         return self

+
     def extend_tensor(self, tensor):
         """Concatenate a new tensor onto the doc.tensor object.
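Note on the Doc serialization change above: to_bytes()/from_bytes() now delegate to the new to_dict()/from_dict() methods, with msgpack handling only the byte encoding. A small round-trip sketch of that surface, as read from the diff; it is illustration only, not part of the commit.

# Illustrative sketch, not part of this commit: the dict-based and
# bytes-based round trips added above produce the same document.
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])

msg = doc.to_dict()                   # plain dict of serialized fields
doc2 = Doc(vocab).from_dict(msg)      # rebuild a Doc from the dict

data = doc.to_bytes()                 # msgpack-encoded version of the same dict
doc3 = Doc(vocab).from_bytes(data)
assert doc2.text == doc3.text == doc.text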
@@ -778,6 +778,10 @@ cdef class Token:
         """
         return self.c.ent_iob

+    @classmethod
+    def iob_strings(cls):
+        return ("", "I", "O", "B")
+
     @property
     def ent_iob_(self):
         """IOB code of named entity tag. "B" means the token begins an entity,

@@ -787,8 +791,7 @@ cdef class Token:

         RETURNS (str): IOB code of named entity tag.
         """
-        iob_strings = ("", "I", "O", "B")
-        return iob_strings[self.c.ent_iob]
+        return self.iob_strings()[self.c.ent_iob]

     property ent_id:
         """RETURNS (uint64): ID of the entity the token is an instance of,
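The new Token.iob_strings() classmethod is the single source for the IOB code table: ent_iob_ indexes into it, and Doc.from_array uses it to validate ENT_IOB values (see the check added above). A tiny sketch:

# Illustrative sketch, not part of this commit.
from spacy.tokens import Token

print(Token.iob_strings())   # ('', 'I', 'O', 'B')
# A stored ent_iob value of 3 therefore renders as "B" via Token.ent_iob_.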
@@ -819,16 +819,23 @@ def filter_spans(spans):


 def to_bytes(getters, exclude):
+    return srsly.msgpack_dumps(to_dict(getters, exclude))
+
+
+def from_bytes(bytes_data, setters, exclude):
+    return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)
+
+
+def to_dict(getters, exclude):
     serialized = {}
     for key, getter in getters.items():
         # Split to support file names like meta.json
         if key.split(".")[0] not in exclude:
             serialized[key] = getter()
-    return srsly.msgpack_dumps(serialized)
+    return serialized


-def from_bytes(bytes_data, setters, exclude):
-    msg = srsly.msgpack_loads(bytes_data)
+def from_dict(msg, setters, exclude):
     for key, setter in setters.items():
         # Split to support file names like meta.json
         if key.split(".")[0] not in exclude and key in msg:
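Note on the util refactor above: to_bytes()/from_bytes() are now thin msgpack wrappers around to_dict()/from_dict(), so objects can expose a dict-based serialization path without double-encoding. The sketch below assumes, as in the existing helpers, that each setter receives the stored value for its key; it is illustration only.

# Illustrative sketch, not part of this commit: the dict and bytes helpers
# now compose, producing the same payload either way.
from spacy import util

getters = {"meta": lambda: {"lang": "en"}}
msg = util.to_dict(getters, exclude=[])       # {"meta": {"lang": "en"}}
data = util.to_bytes(getters, exclude=[])     # msgpack bytes of the same dict

loaded = {}
setters = {"meta": lambda value: loaded.update(value)}
util.from_bytes(data, setters, exclude=[])
assert loaded == {"lang": "en"}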