Merge from whatif/arrow

Matthew Honnibal 2020-06-14 17:43:59 +02:00
commit d53723aa4f
38 changed files with 2659 additions and 1888 deletions


@@ -23,6 +23,8 @@ Options.docstrings = True
PACKAGES = find_packages()
MOD_NAMES = [
"spacy.gold.align",
"spacy.gold.new_example",
"spacy.parts_of_speech",
"spacy.strings",
"spacy.lexeme",
@@ -35,13 +37,14 @@ MOD_NAMES = [
"spacy.syntax.stateclass",
"spacy.syntax._state",
"spacy.tokenizer",
"spacy.syntax.gold_parse",
"spacy.syntax.nn_parser",
"spacy.syntax._parser_model",
"spacy.syntax._beam_utils",
"spacy.syntax.nonproj",
"spacy.syntax.transition_system",
"spacy.syntax.arc_eager",
"spacy.gold",
"spacy.gold.gold_io",
"spacy.tokens.doc",
"spacy.tokens.span",
"spacy.tokens.token",


@@ -2,6 +2,7 @@ import re
from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags, biluo_tags_from_offsets
from ...gold import TokenAnnotation
from ...language import Language
from ...tokens import Doc, Token
from .conll_ner2json import n_sents_info
@@ -284,13 +285,8 @@ def example_from_conllu_sentence(
spaces.append(t._.merged_spaceafter)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
ents = biluo_tags_from_offsets(doc, ent_offsets)
raw = ""
for word, space in zip(words, spaces):
raw += word
if space:
raw += " "
example = Example(doc=raw)
example.set_token_annotation(
example = Example(doc=Doc(vocab, words=words, spaces=spaces))
example.token_annotation = TokenAnnotation(
ids=ids,
words=words,
tags=tags,


@@ -13,7 +13,11 @@ from thinc.api import Model, use_pytorch_for_gpu_memory
import random
from ..gold import GoldCorpus
<<<<<<< HEAD
from ..gold import Example
=======
from ..lookups import Lookups
>>>>>>> origin/develop
from .. import util
from ..errors import Errors
from ..ml import models # don't remove - required to load the built-in architectures
@@ -223,7 +227,6 @@ def train(
limit = training["limit"]
msg.info("Loading training corpus")
corpus = GoldCorpus(data_paths["train"], data_paths["dev"], limit=limit)
# verify textcat config
if "textcat" in nlp_config["pipeline"]:
textcat_labels = set(nlp.get_pipe("textcat").labels)
@@ -281,9 +284,7 @@ def train(
nlp.resume_training()
else:
msg.info(f"Initializing the nlp pipeline: {nlp.pipe_names}")
nlp.begin_training(
lambda: corpus.train_examples
)
nlp.begin_training(lambda: corpus.train_dataset(nlp))
# Update tag map with provided mapping
nlp.vocab.morphology.tag_map.update(tag_map)
@@ -373,6 +374,16 @@ def train(
def create_train_batches(nlp, corpus, cfg):
epochs_todo = cfg.get("max_epochs", 0)
while True:
<<<<<<< HEAD
train_examples = list(corpus.train_dataset(
nlp,
noise_level=0.0,
orth_variant_level=cfg["orth_variant_level"],
gold_preproc=cfg["gold_preproc"],
max_length=cfg["max_length"],
ignore_misaligned=True
))
=======
train_examples = list(
corpus.train_dataset(
nlp,
@@ -383,6 +394,7 @@ def create_train_batches(nlp, corpus, cfg):
ignore_misaligned=True,
)
)
>>>>>>> origin/develop
if len(train_examples) == 0:
raise ValueError(Errors.E988)
random.shuffle(train_examples)
@@ -413,6 +425,7 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
nlp, gold_preproc=cfg["gold_preproc"], ignore_misaligned=True
)
)
n_words = sum(len(ex.doc) for ex in dev_examples)
start_time = timer()


@@ -620,6 +620,14 @@ class Errors(object):
E999 = ("Encountered an unexpected format for the dictionary holding "
"gold annotations: {gold_dict}")
# TODO: These were left over after a merge, but I couldn't find them?
#E983 = ("Each link annotation should refer to a dictionary with at most one "
# "identifier mapping to 1.0, and all others to 0.0.")
#E984 = ("The offsets of the annotations for 'links' need to refer exactly "
# "to the offsets of the 'entities' annotations.")
#E985 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
# "into {values}, but found {value}.")
@add_codes
class TempErrors(object):


@@ -1,68 +0,0 @@
from cymem.cymem cimport Pool
from .typedefs cimport attr_t
from .syntax.transition_system cimport Transition
from .tokens import Doc
cdef struct GoldParseC:
int* tags
int* heads
int* has_dep
int* sent_start
attr_t* labels
int** brackets
Transition* ner
cdef class GoldParse:
cdef Pool mem
cdef GoldParseC c
cdef readonly TokenAnnotation orig
cdef int length
cdef public int loss
cdef public list words
cdef public list tags
cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list sent_starts
cdef public list heads
cdef public list labels
cdef public dict orths
cdef public list ner
cdef public dict brackets
cdef public dict cats
cdef public dict links
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand
cdef class TokenAnnotation:
cdef public list ids
cdef public list words
cdef public list tags
cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list heads
cdef public list deps
cdef public list entities
cdef public list sent_starts
cdef public dict brackets_by_start
cdef class DocAnnotation:
cdef public object cats
cdef public object links
cdef class Example:
cdef public object doc
cdef public TokenAnnotation token_annotation
cdef public DocAnnotation doc_annotation
cdef public object goldparse

File diff suppressed because it is too large

spacy/gold/__init__.pxd (new empty file)

spacy/gold/__init__.py (new file, 13 lines)

@@ -0,0 +1,13 @@
from .corpus import GoldCorpus
from ..syntax.gold_parse import GoldParse
from .example import Example
from .annotation import TokenAnnotation, DocAnnotation
from .align import align
from .iob_utils import iob_to_biluo, biluo_to_iob
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags
from .iob_utils import spans_from_biluo_tags
from .iob_utils import tags_to_entities
from .gold_io import docs_to_json
from .gold_io import read_json_file

spacy/gold/align.pxd (new file, 8 lines)

@@ -0,0 +1,8 @@
cdef class Alignment:
cdef public object cost
cdef public object i2j
cdef public object j2i
cdef public object i2j_multi
cdef public object j2i_multi
cdef public object cand_to_gold
cdef public object gold_to_cand

spacy/gold/align.pyx (new file, 101 lines)

@@ -0,0 +1,101 @@
import numpy
from ..errors import Errors, AlignmentError
cdef class Alignment:
def __init__(self, spacy_words, gold_words):
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
cost, i2j, j2i, i2j_multi, j2i_multi = align(spacy_words, gold_words)
self.cost = cost
self.i2j = i2j
self.j2i = j2i
self.i2j_multi = i2j_multi
self.j2i_multi = j2i_multi
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
def align(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations.
tokens_a (List[str]): The candidate tokenization.
tokens_b (List[str]): The reference tokenization.
RETURNS: (tuple): A 5-tuple consisting of the following information:
* cost (int): The number of misaligned tokens.
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
it has the value -1.
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
the same token of `tokens_b`.
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
tokens_a = _normalize_for_alignment(tokens_a)
tokens_b = _normalize_for_alignment(tokens_b)
cost = 0
a2b = numpy.empty(len(tokens_a), dtype="i")
b2a = numpy.empty(len(tokens_b), dtype="i")
a2b.fill(-1)
b2a.fill(-1)
a2b_multi = {}
b2a_multi = {}
i = 0
j = 0
offset_a = 0
offset_b = 0
while i < len(tokens_a) and j < len(tokens_b):
a = tokens_a[i][offset_a:]
b = tokens_b[j][offset_b:]
if a == b:
if offset_a == offset_b == 0:
a2b[i] = j
b2a[j] = i
elif offset_a == 0:
cost += 2
a2b_multi[i] = j
elif offset_b == 0:
cost += 2
b2a_multi[j] = i
offset_a = offset_b = 0
i += 1
j += 1
elif a == "":
assert offset_a == 0
cost += 1
i += 1
elif b == "":
assert offset_b == 0
cost += 1
j += 1
elif b.startswith(a):
cost += 1
if offset_a == 0:
a2b_multi[i] = j
i += 1
offset_a = 0
offset_b += len(a)
elif a.startswith(b):
cost += 1
if offset_b == 0:
b2a_multi[j] = i
j += 1
offset_b = 0
offset_a += len(b)
else:
assert "".join(tokens_a) != "".join(tokens_b)
raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
return cost, a2b, b2a, a2b_multi, b2a_multi
def _normalize_for_alignment(tokens):
return [w.replace(" ", "").lower() for w in tokens]
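
To make the 5-tuple returned by align() concrete, here is a minimal usage sketch (illustrative, not part of the diff). The token lists are invented, and the import path follows the spacy.gold.align module added in this commit; the exact values assume the implementation above.

from spacy.gold.align import align

cand = ["I", "listen", "to", "obama", "'s", "podcasts", "."]
gold = ["i", "listen", "to", "obama's", "podcasts", "."]
cost, a2b, b2a, a2b_multi, b2a_multi = align(cand, gold)

# Tokens are lowercased and stripped of spaces before matching, so "I" == "i".
assert a2b.tolist() == [0, 1, 2, -1, -1, 4, 5]  # cand[3] and cand[4] have no 1:1 match
assert b2a.tolist() == [0, 1, 2, -1, 5, 6]      # gold "obama's" has no 1:1 match either
assert a2b_multi == {3: 3, 4: 3}                # both cand tokens map onto gold[3]
assert b2a_multi == {}
assert cost == 3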

spacy/gold/annotation.py (new file, 150 lines)

@@ -0,0 +1,150 @@
from .iob_utils import biluo_tags_from_offsets
class TokenAnnotation:
def __init__(
self,
ids=None,
words=None,
tags=None,
pos=None,
morphs=None,
lemmas=None,
heads=None,
deps=None,
entities=None,
sent_starts=None,
brackets=None,
):
self.ids = ids if ids else []
self.words = words if words else []
self.tags = tags if tags else []
self.pos = pos if pos else []
self.morphs = morphs if morphs else []
self.lemmas = lemmas if lemmas else []
self.heads = heads if heads else []
self.deps = deps if deps else []
self.entities = entities if entities else []
self.sent_starts = sent_starts if sent_starts else []
self.brackets_by_start = {}
if brackets:
for b_start, b_end, b_label in brackets:
self.brackets_by_start.setdefault(b_start, []).append((b_end, b_label))
def get_field(self, field):
if field == "id":
return self.ids
elif field == "word":
return self.words
elif field == "tag":
return self.tags
elif field == "pos":
return self.pos
elif field == "morph":
return self.morphs
elif field == "lemma":
return self.lemmas
elif field == "head":
return self.heads
elif field == "dep":
return self.deps
elif field == "ner":
return self.entities
elif field == "sent_start":
return self.sent_starts
else:
raise ValueError(f"Unknown field: {field}")
@property
def brackets(self):
brackets = []
for start, ends_labels in self.brackets_by_start.items():
for end, label in ends_labels:
brackets.append((start, end, label))
return brackets
@classmethod
def from_dict(cls, token_dict):
return cls(
ids=token_dict.get("ids", None),
words=token_dict.get("words", None),
tags=token_dict.get("tags", None),
pos=token_dict.get("pos", None),
morphs=token_dict.get("morphs", None),
lemmas=token_dict.get("lemmas", None),
heads=token_dict.get("heads", None),
deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None),
sent_starts=token_dict.get("sent_starts", None),
brackets=token_dict.get("brackets", None),
)
def to_dict(self):
return {
"ids": self.ids,
"words": self.words,
"tags": self.tags,
"pos": self.pos,
"morphs": self.morphs,
"lemmas": self.lemmas,
"heads": self.heads,
"deps": self.deps,
"entities": self.entities,
"sent_starts": self.sent_starts,
"brackets": self.brackets,
}
def get_id(self, i):
return self.ids[i] if i < len(self.ids) else i
def get_word(self, i):
return self.words[i] if i < len(self.words) else ""
def get_tag(self, i):
return self.tags[i] if i < len(self.tags) else "-"
def get_pos(self, i):
return self.pos[i] if i < len(self.pos) else ""
def get_morph(self, i):
return self.morphs[i] if i < len(self.morphs) else ""
def get_lemma(self, i):
return self.lemmas[i] if i < len(self.lemmas) else ""
def get_head(self, i):
return self.heads[i] if i < len(self.heads) else i
def get_dep(self, i):
return self.deps[i] if i < len(self.deps) else ""
def get_entity(self, i):
return self.entities[i] if i < len(self.entities) else "-"
def get_sent_start(self, i):
return self.sent_starts[i] if i < len(self.sent_starts) else None
def __str__(self):
return str(self.to_dict())
def __repr__(self):
return self.__str__()
class DocAnnotation:
def __init__(self, cats=None, links=None):
self.cats = cats if cats else {}
self.links = links if links else {}
@classmethod
def from_dict(cls, doc_dict):
return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))
def to_dict(self):
return {"cats": self.cats, "links": self.links}
def __str__(self):
return str(self.to_dict())
def __repr__(self):
return self.__str__()
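
A short hedged sketch of how these containers behave, using made-up values; the import path follows the new spacy.gold.annotation module shown above.

from spacy.gold.annotation import TokenAnnotation, DocAnnotation

token_ann = TokenAnnotation.from_dict({
    "words": ["London", "calling"],
    "tags": ["NNP", "VBG"],
    "brackets": [(0, 1, "NP")],
})
assert token_ann.get_field("word") == ["London", "calling"]
assert token_ann.get_tag(5) == "-"           # out-of-range lookups fall back to defaults
assert token_ann.brackets == [(0, 1, "NP")]  # rebuilt from brackets_by_start

doc_ann = DocAnnotation.from_dict({"cats": {"POSITIVE": 1.0}})
assert doc_ann.to_dict() == {"cats": {"POSITIVE": 1.0}, "links": {}}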

spacy/gold/augment.py (new file, 131 lines)

@@ -0,0 +1,131 @@
import random
import itertools
from .example import Example
from .annotation import TokenAnnotation
def make_orth_variants(nlp, example, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return example
if not example.token_annotation:
return example
raw = example.text
lower = False
if random.random() >= 0.5:
lower = True
if raw is not None:
raw = raw.lower()
ndsv = nlp.Defaults.single_orth_variants
ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples
variant_example = Example(doc=nlp.make_doc(raw))
token_annotation = example.token_annotation
words = token_annotation.words
tags = token_annotation.tags
if not words or not tags:
# add the unmodified annotation
token_dict = token_annotation.to_dict()
variant_example.token_annotation = TokenAnnotation(**token_dict)
else:
if lower:
words = [w.lower() for w in words]
# single variants
punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndsv)):
if (
tags[word_idx] in ndsv[punct_idx]["tags"]
and words[word_idx] in ndsv[punct_idx]["variants"]
):
words[word_idx] = punct_choices[punct_idx]
# paired variants
punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndpv)):
if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
word_idx
] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
# backup option: random left vs. right from pair
pair_idx = random.choice([0, 1])
# best option: rely on paired POS tags like `` / ''
if len(ndpv[punct_idx]["tags"]) == 2:
pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
# next best option: rely on position in variants
# (may not be unambiguous, so order of variants matters)
else:
for pair in ndpv[punct_idx]["variants"]:
if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx]
token_dict = token_annotation.to_dict()
token_dict["words"] = words
token_dict["tags"] = tags
variant_example.token_annotation = TokenAnnotation(**token_dict)
# modify raw to match variant_paragraph_tuples
if raw is not None:
variants = []
for single_variants in ndsv:
variants.extend(single_variants["variants"])
for paired_variants in ndpv:
variants.extend(
list(itertools.chain.from_iterable(paired_variants["variants"]))
)
# store variants in reverse length order to be able to prioritize
# longer matches (e.g., "---" before "--")
variants = sorted(variants, key=lambda x: len(x))
variants.reverse()
variant_raw = ""
raw_idx = 0
# add initial whitespace
while raw_idx < len(raw) and raw[raw_idx].isspace():
variant_raw += raw[raw_idx]
raw_idx += 1
for word in variant_example.token_annotation.words:
match_found = False
# skip whitespace words
if word.isspace():
match_found = True
# add identical word
elif word not in variants and raw[raw_idx:].startswith(word):
variant_raw += word
raw_idx += len(word)
match_found = True
# add variant word
else:
for variant in variants:
if not match_found and raw[raw_idx:].startswith(variant):
raw_idx += len(variant)
variant_raw += word
match_found = True
# something went wrong, abort
# (add a warning message?)
if not match_found:
return example
# add following whitespace
while raw_idx < len(raw) and raw[raw_idx].isspace():
variant_raw += raw[raw_idx]
raw_idx += 1
variant_example.doc = variant_raw
return variant_example
return variant_example
def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
elif type(orig) == list:
corrupted = [_corrupt(word, noise_level) for word in orig]
corrupted = [w for w in corrupted if w]
return corrupted
else:
return "".join(_corrupt(c, noise_level) for c in orig)
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c in [".", "'", "!", "?", ","]:
return "\n"
else:
return c.lower()
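
add_noise() and _corrupt() are probabilistic, but forcing noise_level to 1.0 makes the behaviour deterministic and easy to see. A small illustrative sketch (strings invented; import path per the new spacy.gold.augment module):

from spacy.gold.augment import add_noise

# At noise_level=1.0, random.random() >= 1.0 is never true, so every character
# (or word) goes through _corrupt(): sentence punctuation becomes "\n" and
# everything else is lowercased.
assert add_noise("Hello, World!", 1.0) == "hello\n world\n"
assert add_noise(["Hello", ",", "World"], 1.0) == ["hello", "\n", "world"]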

spacy/gold/corpus.py (new file, 226 lines)

@@ -0,0 +1,226 @@
import random
import shutil
import tempfile
import srsly
from pathlib import Path
import itertools
from ..tokens import Doc
from .. import util
from ..errors import Errors, AlignmentError
from .gold_io import read_json_file, json_to_annotations
from .augment import make_orth_variants, add_noise
from .new_example import NewExample as Example
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.
DOCS: https://spacy.io/api/goldcorpus
"""
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
train (str / Path): File or directory of training data.
dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
"""
self.limit = limit
if isinstance(train, str) or isinstance(train, Path):
train = self.read_annotations(self.walk_corpus(train))
dev = self.read_annotations(self.walk_corpus(dev))
# Write temp directory with one doc per file, so we can shuffle and stream
self.tmp_dir = Path(tempfile.mkdtemp())
self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
def __del__(self):
shutil.rmtree(self.tmp_dir)
@staticmethod
def write_msgpack(directory, examples, limit=0):
if not directory.exists():
directory.mkdir()
n = 0
for i, ex_dict in enumerate(examples):
text = ex_dict["text"]
srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
n += 1
if limit and n >= limit:
break
@staticmethod
def walk_corpus(path):
path = util.ensure_path(path)
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith((".json", ".jsonl")):
locs.append(path)
return locs
@staticmethod
def read_annotations(locs, limit=0):
""" Yield training examples """
i = 0
for loc in locs:
loc = util.ensure_path(loc)
file_name = loc.parts[-1]
if file_name.endswith("json"):
examples = read_json_file(loc)
elif file_name.endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
# TODO: proper format checks with schemas
if isinstance(first_gold_tuple, dict):
if first_gold_tuple.get("paragraphs", None):
examples = []
for json_doc in gold_tuples:
examples.extend(json_to_annotations(json_doc))
elif first_gold_tuple.get("doc_annotation", None):
examples = []
for ex_dict in gold_tuples:
doc = ex_dict.get("doc", None)
if doc is None:
doc = ex_dict.get("text", None)
if not (
doc is None
or isinstance(doc, Doc)
or isinstance(doc, str)
):
raise ValueError(Errors.E987.format(type=type(doc)))
examples.append(ex_dict)
elif file_name.endswith("msg"):
text, ex_dict = srsly.read_msgpack(loc)
examples = [ex_dict]
else:
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=loc, formats=supported))
try:
for example in examples:
yield example
i += 1
if limit and i >= limit:
return
except KeyError as e:
msg = "Missing key {}".format(e)
raise KeyError(Errors.E996.format(file=file_name, msg=msg))
except UnboundLocalError as e:
msg = "Unexpected document structure"
raise ValueError(Errors.E996.format(file=file_name, msg=msg))
@property
def dev_annotations(self):
locs = (self.tmp_dir / "dev").iterdir()
yield from self.read_annotations(locs, limit=self.limit)
@property
def train_annotations(self):
locs = (self.tmp_dir / "train").iterdir()
yield from self.read_annotations(locs, limit=self.limit)
def count_train(self):
"""Returns count of words in train examples"""
n = 0
i = 0
for eg_dict in self.train_annotations:
n += len(eg_dict["token_annotation"]["words"])
if self.limit and i >= self.limit:
break
i += 1
return n
def train_dataset(
self,
nlp,
gold_preproc=False,
max_length=None,
noise_level=0.0,
orth_variant_level=0.0,
ignore_misaligned=False,
):
locs = list((self.tmp_dir / "train").iterdir())
random.shuffle(locs)
train_annotations = self.read_annotations(locs, limit=self.limit)
examples = self.iter_examples(
nlp,
train_annotations,
gold_preproc,
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True,
ignore_misaligned=ignore_misaligned,
)
yield from examples
def train_dataset_without_preprocessing(
self, nlp, gold_preproc=False, ignore_misaligned=False
):
examples = self.iter_examples(
nlp,
self.train_annotations,
gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned,
)
yield from examples
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
examples = self.iter_examples(
nlp,
self.dev_annotations,
gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned,
)
yield from examples
@classmethod
def iter_examples(
cls,
nlp,
annotations,
gold_preproc,
max_length=None,
noise_level=0.0,
orth_variant_level=0.0,
make_projective=False,
ignore_misaligned=False,
):
""" Setting gold_preproc will result in creating a doc per sentence """
for eg_dict in annotations:
if eg_dict["text"]:
example = Example.from_dict(
nlp.make_doc(eg_dict["text"]),
eg_dict
)
else:
example = Example.from_dict(
Doc(nlp.vocab, words=eg_dict["words"]),
eg_dict
)
if gold_preproc:
# TODO: Data augmentation
examples = example.split_sents()
else:
examples = [example]
for ex in examples:
if (not max_length) or len(ex.predicted) < max_length:
if ignore_misaligned:
try:
_ = ex._deprecated_get_gold()
except AlignmentError:
continue
yield ex
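
For orientation, a hedged sketch of driving the relocated GoldCorpus. The file paths "train.json" and "dev.json" are placeholders and must point at data in spaCy's JSON training format.

import itertools
import spacy
from spacy.gold import GoldCorpus

nlp = spacy.blank("en")
corpus = GoldCorpus("train.json", "dev.json", limit=1000)
print("training words:", corpus.count_train())

# train_dataset() shuffles the per-document msgpack files written at init time
# and yields Example objects: .predicted is built with nlp.make_doc() (or from
# the word list), and .reference carries the gold annotations as far as this
# transitional code copies them over.
for eg in itertools.islice(corpus.train_dataset(nlp, max_length=200), 3):
    print(len(eg.predicted), "tokens:", eg.predicted.text[:40])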

spacy/gold/example.py (new file, 261 lines)

@@ -0,0 +1,261 @@
import numpy
from .annotation import TokenAnnotation, DocAnnotation
from .iob_utils import spans_from_biluo_tags, biluo_tags_from_offsets
from .align import Alignment
from ..errors import Errors, AlignmentError
from ..tokens import Doc
def annotations2doc(doc, doc_annot, tok_annot):
# TODO: Improve and test this
words = tok_annot.words or [tok.text for tok in doc]
fields = {
"tags": "TAG",
"pos": "POS",
"lemmas": "LEMMA",
"deps": "DEP",
}
attrs = []
values = []
for field, attr in fields.items():
value = getattr(tok_annot, field)
# Unset fields will be empty lists.
if value:
attrs.append(attr)
values.append([doc.vocab.strings.add(v) for v in value])
if tok_annot.heads:
attrs.append("HEAD")
values.append([h - i for i, h in enumerate(tok_annot.heads)])
output = Doc(doc.vocab, words=words)
if values:
array = numpy.array(values, dtype="uint64")
output = output.from_array(attrs, array.T)
if tok_annot.entities:
output.ents = spans_from_biluo_tags(output, tok_annot.entities)
doc.cats = dict(doc_annot.cats)
# TODO: Calculate token.ent_kb_id from links.
# We need to fix this and the doc.ents thing, both should be doc
# annotations.
return doc
class Example:
def __init__(self, doc, doc_annotation=None, token_annotation=None):
""" Doc can either be text, or an actual Doc """
if not isinstance(doc, Doc):
raise TypeError("Must pass Doc instance")
self.predicted = doc
self.doc = doc
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
self.token_annotation = (
token_annotation if token_annotation else TokenAnnotation()
)
self._alignment = None
self.reference = annotations2doc(
self.doc,
self.doc_annotation,
self.token_annotation
)
@property
def x(self):
return self.predicted
@property
def y(self):
return self.reference
def _deprecated_get_gold(self, make_projective=False):
from ..syntax.gold_parse import get_parses_from_example
_, gold = get_parses_from_example(self, make_projective=make_projective)[0]
return gold
@classmethod
def from_dict(cls, example_dict, doc=None):
if example_dict is None:
raise ValueError("Example.from_dict expected dict, received None")
if doc is None:
raise ValueError("Must pass doc")
# TODO: This is ridiculous...
token_dict = example_dict.get("token_annotation", {})
doc_dict = example_dict.get("doc_annotation", {})
for key, value in example_dict.items():
if key in ("token_annotation", "doc_annotation"):
pass
elif key in ("cats", "links"):
doc_dict[key] = value
else:
token_dict[key] = value
if token_dict.get("entities"):
entities = token_dict["entities"]
if isinstance(entities[0], (list, tuple)):
token_dict["entities"] = biluo_tags_from_offsets(doc, entities)
token_annotation = TokenAnnotation.from_dict(token_dict)
doc_annotation = DocAnnotation.from_dict(doc_dict)
return cls(
doc=doc, doc_annotation=doc_annotation, token_annotation=token_annotation
)
@property
def alignment(self):
if self._alignment is None:
if self.doc is None:
return None
spacy_words = [token.orth_ for token in self.predicted]
gold_words = [token.orth_ for token in self.reference]
if gold_words == []:
gold_words = spacy_words
self._alignment = Alignment(spacy_words, gold_words)
return self._alignment
def to_dict(self):
""" Note that this method does NOT export the doc, only the annotations ! """
token_dict = self.token_annotation.to_dict()
doc_dict = self.doc_annotation.to_dict()
return {"token_annotation": token_dict, "doc_annotation": doc_dict}
@property
def text(self):
if self.doc is None:
return None
if isinstance(self.doc, Doc):
return self.doc.text
return self.doc
def get_aligned(self, field):
"""Return an aligned array for a token annotation field."""
if self.doc is None:
return self.token_annotation.get_field(field)
doc = self.doc
if field == "word":
return [token.orth_ for token in doc]
gold_values = self.token_annotation.get_field(field)
alignment = self.alignment
i2j_multi = alignment.i2j_multi
gold_to_cand = alignment.gold_to_cand
cand_to_gold = alignment.cand_to_gold
output = []
for i, gold_i in enumerate(cand_to_gold):
if doc[i].text.isspace():
output.append(None)
elif gold_i is None:
if i in i2j_multi:
output.append(gold_values[i2j_multi[i]])
else:
output.append(None)
else:
output.append(gold_values[gold_i])
return output
def set_doc_annotation(self, cats=None, links=None):
if cats:
self.doc_annotation.cats = cats
if links:
self.doc_annotation.links = links
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
if not self.token_annotation.words:
return [self]
s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
s_brackets = []
sent_start_i = 0
t = self.token_annotation
split_examples = []
for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1:
split_examples.append(
Example(
doc=Doc(self.doc.vocab, words=s_words),
token_annotation=TokenAnnotation(
ids=s_ids,
words=s_words,
tags=s_tags,
pos=s_pos,
morphs=s_morphs,
lemmas=s_lemmas,
heads=s_heads,
deps=s_deps,
entities=s_ents,
sent_starts=s_sent_starts,
brackets=s_brackets,
),
doc_annotation=self.doc_annotation
)
)
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
s_sent_starts, s_brackets = [], []
sent_start_i = i
s_ids.append(t.get_id(i))
s_words.append(t.get_word(i))
s_tags.append(t.get_tag(i))
s_pos.append(t.get_pos(i))
s_morphs.append(t.get_morph(i))
s_lemmas.append(t.get_lemma(i))
s_heads.append(t.get_head(i) - sent_start_i)
s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i))
s_sent_starts.append(t.get_sent_start(i))
for b_end, b_label in t.brackets_by_start.get(i, []):
s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
i += 1
split_examples.append(
Example(
doc=Doc(self.doc.vocab, words=s_words),
token_annotation=TokenAnnotation(
ids=s_ids,
words=s_words,
tags=s_tags,
pos=s_pos,
morphs=s_morphs,
lemmas=s_lemmas,
heads=s_heads,
deps=s_deps,
entities=s_ents,
sent_starts=s_sent_starts,
brackets=s_brackets,
),
doc_annotation=self.doc_annotation
)
)
return split_examples
@classmethod
def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
"""
Return a list of Example objects, from a variety of input formats.
make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
"""
if isinstance(examples, Example):
return [examples]
if isinstance(examples, tuple):
examples = [examples]
converted_examples = []
for ex in examples:
if isinstance(ex, Example):
converted_examples.append(ex)
# convert string to Doc to Example
elif isinstance(ex, str):
if keep_raw_text:
converted_examples.append(Example(doc=ex))
else:
doc = make_doc(ex)
converted_examples.append(Example(doc=doc))
# convert tuples to Example
elif isinstance(ex, tuple) and len(ex) == 2:
doc, gold = ex
# convert string to Doc
if isinstance(doc, str) and not keep_raw_text:
doc = make_doc(doc)
converted_examples.append(Example.from_dict(gold, doc=doc))
# convert Doc to Example
elif isinstance(ex, Doc):
converted_examples.append(Example(doc=ex))
else:
converted_examples.append(ex)
return converted_examples
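
A hedged end-to-end sketch of the transitional Example class above, with a made-up sentence; the character-offset entities are converted to BILUO tags by from_dict(), and the import follows the new spacy/gold/__init__.py.

import spacy
from spacy.tokens import Doc
from spacy.gold import Example

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["Facebook", "bought", "WhatsApp"])
example = Example.from_dict(
    {
        "words": ["Facebook", "bought", "WhatsApp"],
        "tags": ["NNP", "VBD", "NNP"],
        "entities": [(0, 8, "ORG"), (16, 24, "ORG")],  # character offsets
    },
    doc=doc,
)
assert example.token_annotation.entities == ["U-ORG", "O", "U-ORG"]
assert example.get_aligned("tag") == ["NNP", "VBD", "NNP"]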

spacy/gold/gold_io.pyx (new file, 198 lines)

@@ -0,0 +1,198 @@
import warnings
import srsly
from .. import util
from ..errors import Warnings
from ..tokens import Token, Doc
from .iob_utils import biluo_tags_from_offsets
def merge_sents(sents):
m_deps = [[], [], [], [], [], []]
m_cats = {}
m_brackets = []
i = 0
for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
m_deps[0].extend(id_ + i for id_ in ids)
m_deps[1].extend(words)
m_deps[2].extend(tags)
m_deps[3].extend(head + i for head in heads)
m_deps[4].extend(labels)
m_deps[5].extend(ner)
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
for b in brackets)
m_cats.update(cats)
i += len(ids)
return [(m_deps, (m_cats, m_brackets))]
def docs_to_json(docs, id=0, ner_missing_tag="O"):
"""Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command.
docs (iterable / Doc): The Doc object(s) to convert.
id (int): Id for the JSON.
RETURNS (dict): The data in spaCy's JSON format
- each input doc will be treated as a paragraph in the output doc
"""
if isinstance(docs, Doc):
docs = [docs]
json_doc = {"id": id, "paragraphs": []}
for i, doc in enumerate(docs):
json_para = {'raw': doc.text, "sentences": [], "cats": []}
for cat, val in doc.cats.items():
json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
if doc.is_tagged:
json_token["tag"] = token.tag_
json_token["pos"] = token.pos_
json_token["morph"] = token.morph_
json_token["lemma"] = token.lemma_
if doc.is_parsed:
json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_
json_token["ner"] = biluo_tags[token.i]
json_sent["tokens"].append(json_token)
json_para["sentences"].append(json_sent)
json_doc["paragraphs"].append(json_para)
return json_doc
def read_json_file(loc, docs_filter=None, limit=None):
loc = util.ensure_path(loc)
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)
else:
for doc in json_iterate(loc):
if docs_filter is not None and not docs_filter(doc):
continue
for json_data in json_to_annotations(doc):
yield json_data
def json_to_annotations(doc):
"""Convert an item in the JSON-formatted training data to the format
used by GoldParse.
doc (dict): One entry in the training data.
YIELDS (tuple): The reformatted data - one training example per paragraph
"""
for paragraph in doc["paragraphs"]:
example = {"text": paragraph.get("raw", None)}
words = []
ids = []
tags = []
pos = []
morphs = []
lemmas = []
heads = []
labels = []
ner = []
sent_starts = []
brackets = []
for sent in paragraph["sentences"]:
sent_start_i = len(words)
for i, token in enumerate(sent["tokens"]):
words.append(token["orth"])
ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-"))
pos.append(token.get("pos", ""))
morphs.append(token.get("morph", ""))
lemmas.append(token.get("lemma", ""))
heads.append(token.get("head", 0) + sent_start_i + i)
labels.append(token.get("dep", ""))
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
if i == 0:
sent_starts.append(1)
else:
sent_starts.append(0)
if "brackets" in sent:
brackets.extend((b["first"] + sent_start_i,
b["last"] + sent_start_i, b["label"])
for b in sent["brackets"])
cats = {}
for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"]
example["token_annotation"] = dict(
ids=ids,
words=words,
tags=tags,
pos=pos,
morphs=morphs,
lemmas=lemmas,
heads=heads,
deps=labels,
entities=ner,
sent_starts=sent_starts,
brackets=brackets
)
example["doc_annotation"] = dict(cats=cats)
yield example
def json_iterate(loc):
# We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage.
# It's okay to read in the whole file -- just don't parse it into JSON.
cdef bytes py_raw
loc = util.ensure_path(loc)
with loc.open("rb") as file_:
py_raw = file_.read()
cdef long file_length = len(py_raw)
if file_length > 2 ** 30:
warnings.warn(Warnings.W027.format(size=file_length))
raw = <char*>py_raw
cdef int square_depth = 0
cdef int curly_depth = 0
cdef int inside_string = 0
cdef int escape = 0
cdef long start = -1
cdef char c
cdef char quote = ord('"')
cdef char backslash = ord("\\")
cdef char open_square = ord("[")
cdef char close_square = ord("]")
cdef char open_curly = ord("{")
cdef char close_curly = ord("}")
for i in range(file_length):
c = raw[i]
if escape:
escape = False
continue
if c == backslash:
escape = True
continue
if c == quote:
inside_string = not inside_string
continue
if inside_string:
continue
if c == open_square:
square_depth += 1
elif c == close_square:
square_depth -= 1
elif c == open_curly:
if square_depth == 1 and curly_depth == 0:
start = i
curly_depth += 1
elif c == close_curly:
curly_depth -= 1
if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i + 1].decode("utf8")
try:
yield srsly.json_loads(py_str)
except Exception:
print(py_str)
raise
start = -1
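
To illustrate the JSON training format that json_to_annotations() consumes, here is a hedged, hand-written example document (text and labels invented):

from spacy.gold.gold_io import json_to_annotations

json_doc = {
    "id": 0,
    "paragraphs": [{
        "raw": "I like London.",
        "cats": [{"label": "TRAVEL", "value": 1.0}],
        "sentences": [{
            "tokens": [
                {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                {"id": 2, "orth": "London", "tag": "NNP", "head": -1, "dep": "dobj", "ner": "U-LOC"},
                {"id": 3, "orth": ".", "tag": ".", "head": -2, "dep": "punct", "ner": "O"},
            ],
            "brackets": [],
        }],
    }],
}
example = next(json_to_annotations(json_doc))
assert example["text"] == "I like London."
assert example["token_annotation"]["words"] == ["I", "like", "London", "."]
assert example["token_annotation"]["heads"] == [1, 1, 1, 1]  # relative heads resolved to absolute token indices
assert example["doc_annotation"]["cats"] == {"TRAVEL": 1.0}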

spacy/gold/iob_utils.py (new file, 197 lines)

@@ -0,0 +1,197 @@
import warnings
from ..errors import Errors, Warnings
from ..tokens import Span
def iob_to_biluo(tags):
out = []
tags = list(tags)
while tags:
out.extend(_consume_os(tags))
out.extend(_consume_ent(tags))
return out
def biluo_to_iob(tags):
out = []
for tag in tags:
tag = tag.replace("U-", "B-", 1).replace("L-", "I-", 1)
out.append(tag)
return out
def _consume_os(tags):
while tags and tags[0] == "O":
yield tags.pop(0)
def _consume_ent(tags):
if not tags:
return []
tag = tags.pop(0)
target_in = "I" + tag[1:]
target_last = "L" + tag[1:]
length = 1
while tags and tags[0] in {target_in, target_last}:
length += 1
tags.pop(0)
label = tag[2:]
if length == 1:
if len(label) == 0:
raise ValueError(Errors.E177.format(tag=tag))
return ["U-" + label]
else:
start = "B-" + label
end = "L-" + label
middle = [f"I-{label}" for _ in range(1, length - 1)]
return [start] + middle + [end]
def biluo_tags_from_doc(doc, missing="O"):
return biluo_tags_from_offsets(
doc,
[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
missing=missing
)
def biluo_tags_from_offsets(doc, entities, missing="O"):
"""Encode labelled spans into per-token tags, using the
Begin/In/Last/Unit/Out scheme (BILUO).
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start`
and `end` should be character-offset integers denoting the slice into
the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object.
The training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
EXAMPLE:
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> assert tags == ["O", "O", 'U-LOC', "O"]
"""
# Ensure no overlapping entity labels exist
tokens_in_ents = {}
starts = {token.idx: token.i for token in doc}
ends = {token.idx + len(token): token.i for token in doc}
biluo = ["-" for _ in doc]
# Handle entity cases
for start_char, end_char, label in entities:
for token_index in range(start_char, end_char):
if token_index in tokens_in_ents.keys():
raise ValueError(
Errors.E103.format(
span1=(
tokens_in_ents[token_index][0],
tokens_in_ents[token_index][1],
tokens_in_ents[token_index][2],
),
span2=(start_char, end_char, label),
)
)
tokens_in_ents[token_index] = (start_char, end_char, label)
start_token = starts.get(start_char)
end_token = ends.get(end_char)
# Only interested if the tokenization is correct
if start_token is not None and end_token is not None:
if start_token == end_token:
biluo[start_token] = f"U-{label}"
else:
biluo[start_token] = f"B-{label}"
for i in range(start_token + 1, end_token):
biluo[i] = f"I-{label}"
biluo[end_token] = f"L-{label}"
# Now distinguish the O cases from ones where we miss the tokenization
entity_chars = set()
for start_char, end_char, label in entities:
for i in range(start_char, end_char):
entity_chars.add(i)
for token in doc:
for i in range(token.idx, token.idx + len(token)):
if i in entity_chars:
break
else:
biluo[token.i] = missing
if "-" in biluo:
ent_str = str(entities)
warnings.warn(
Warnings.W030.format(
text=doc.text[:50] + "..." if len(doc.text) > 50 else doc.text,
entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
)
)
return biluo
def spans_from_biluo_tags(doc, tags):
"""Encode per-token tags following the BILUO scheme into Span object, e.g.
to overwrite the doc.ents.
doc (Doc): The document that the BILUO tags refer to.
entities (iterable): A sequence of BILUO tags with each tag describing one
token. Each tags string will be of the form of either "", "O" or
"{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of Span objects.
"""
token_offsets = tags_to_entities(tags)
spans = []
for label, start_idx, end_idx in token_offsets:
span = Span(doc, start_idx, end_idx + 1, label=label)
spans.append(span)
return spans
def offsets_from_biluo_tags(doc, tags):
"""Encode per-token tags following the BILUO scheme into entity offsets.
doc (Doc): The document that the BILUO tags refer to.
entities (iterable): A sequence of BILUO tags with each tag describing one
token. Each tags string will be of the form of either "", "O" or
"{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
`end` will be character-offset integers denoting the slice into the
original string.
"""
spans = spans_from_biluo_tags(doc, tags)
return [(span.start_char, span.end_char, span.label_) for span in spans]
def tags_to_entities(tags):
entities = []
start = None
for i, tag in enumerate(tags):
if tag is None:
continue
if tag.startswith("O"):
# TODO: We shouldn't be getting these malformed inputs. Fix this.
if start is not None:
start = None
continue
elif tag == "-":
continue
elif tag.startswith("I"):
if start is None:
raise ValueError(Errors.E067.format(tags=tags[: i + 1]))
continue
if tag.startswith("U"):
entities.append((tag[2:], i, i))
elif tag.startswith("B"):
start = i
elif tag.startswith("L"):
entities.append((tag[2:], start, i))
start = None
else:
raise ValueError(Errors.E068.format(tag=tag))
return entities
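
A brief hedged demo of the tag-scheme helpers that have no docstring example of their own (tags invented):

from spacy.gold.iob_utils import iob_to_biluo, biluo_to_iob, tags_to_entities

iob = ["O", "B-PER", "I-PER", "O", "B-LOC"]
biluo = iob_to_biluo(iob)
assert biluo == ["O", "B-PER", "L-PER", "O", "U-LOC"]
assert biluo_to_iob(biluo) == iob
# tags_to_entities() returns (label, start_token, end_token) triples, end inclusive.
assert tags_to_entities(biluo) == [("PER", 1, 2), ("LOC", 4, 4)]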


@@ -0,0 +1,8 @@
from ..tokens.doc cimport Doc
from .align cimport Alignment
cdef class NewExample:
cdef readonly Doc x
cdef readonly Doc y
cdef readonly Alignment _alignment

spacy/gold/new_example.pyx (new file, 434 lines)

@@ -0,0 +1,434 @@
import numpy
from ..tokens import Token
from ..tokens.doc cimport Doc
from ..attrs import IDS
from .align cimport Alignment
from .annotation import TokenAnnotation, DocAnnotation
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
from .align import Alignment
from ..errors import Errors, AlignmentError
cpdef Doc annotations2doc(Doc predicted, tok_annot, doc_annot):
# TODO: Improve and test this
words = tok_annot.get("ORTH", [tok.text for tok in predicted])
attrs, array = _annot2array(predicted.vocab, tok_annot, doc_annot)
output = Doc(predicted.vocab, words=words)
if array.size:
output = output.from_array(attrs, array)
output.cats.update(doc_annot.get("cats", {}))
return output
cdef class NewExample:
def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
""" Doc can either be text, or an actual Doc """
msg = "Example.__init__ got None for '{arg}'. Requires Doc."
if predicted is None:
raise TypeError(msg.format(arg="predicted"))
if reference is None:
raise TypeError(msg.format(arg="reference"))
self.x = predicted
self.y = reference
self._alignment = alignment
property predicted:
def __get__(self):
return self.x
def __set__(self, doc):
self.x = doc
property reference:
def __get__(self):
return self.y
def __set__(self, doc):
self.y = doc
@classmethod
def from_dict(cls, Doc predicted, dict example_dict):
if example_dict is None:
raise ValueError("Example.from_dict expected dict, received None")
if not isinstance(predicted, Doc):
raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
example_dict = _fix_legacy_dict_data(predicted, example_dict)
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
return NewExample(
predicted,
annotations2doc(predicted, tok_dict, doc_dict)
)
@property
def alignment(self):
if self._alignment is None:
if self.doc is None:
return None
spacy_words = [token.orth_ for token in self.predicted]
gold_words = [token.orth_ for token in self.reference]
if gold_words == []:
gold_words = spacy_words
self._alignment = Alignment(spacy_words, gold_words)
return self._alignment
def get_aligned(self, field):
"""Return an aligned array for a token attribute."""
# TODO: This is probably wrong. I just bashed this out and there's probably
# all sorts of edge-cases.
alignment = self.alignment
i2j_multi = alignment.i2j_multi
gold_to_cand = alignment.gold_to_cand
cand_to_gold = alignment.cand_to_gold
gold_values = self.reference.to_array([field])
output = []
for i, gold_i in enumerate(cand_to_gold):
if self.predicted[i].text.isspace():
output.append(None)
elif gold_i is None:
if i in i2j_multi:
output.append(gold_values[i2j_multi[i]])
else:
output.append(None)
else:
output.append(gold_values[gold_i])
return output
def to_dict(self):
return {
"doc_annotation": {
"cats": dict(self.reference.cats),
"links": [], # TODO
},
"token_annotation": {
"ids": [t.i+1 for t in self.reference],
"words": [t.text for t in self.reference],
"tags": [t.tag_ for t in self.reference],
"lemmas": [t.lemma_ for t in self.reference],
"pos": [t.pos_ for t in self.reference],
"morphs": [t.morph_ for t in self.reference],
"heads": [t.head.i for t in self.reference],
"deps": [t.dep_ for t in self.reference],
"sent_starts": [int(bool(t.is_sent_start)) for t in self.reference],
"entities": biluo_tags_from_doc(self.reference)
}
}
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
if not self.reference.is_sentenced:
return [self]
# TODO: Do this for misaligned somehow?
predicted_words = [t.text for t in self.predicted]
reference_words = [t.text for t in self.reference]
if predicted_words != reference_words:
raise NotImplementedError("TODO: Implement this")
# Implement the easy case.
output = []
cls = self.__class__
for sent in self.reference.sents:
# I guess for misaligned we just need to use the gold_to_cand?
output.append(
cls(
self.predicted[sent.start : sent.end + 1].as_doc(),
sent.as_doc()
)
)
return output
def text(self):
return self.x.text
def _annot2array(vocab, tok_annot, doc_annot):
attrs = []
values = []
for key, value in doc_annot.items():
if key == "entities":
words = tok_annot["ORTH"]
ent_iobs, ent_types = _parse_ner_tags(vocab, words, value)
tok_annot["ENT_IOB"] = ent_iobs
tok_annot["ENT_TYPE"] = ent_types
elif key == "links":
entities = doc_annot.get("entities", {})
if value and not entities:
raise ValueError(Errors.E984)
ent_kb_ids = _parse_links(vocab, words, value, entities)
tok_annot["ENT_KB_ID"] = ent_kb_ids
elif key == "cats":
pass
else:
raise ValueError(f"Unknown doc attribute: {key}")
for key, value in tok_annot.items():
if key not in IDS:
raise ValueError(f"Unknown token attribute: {key}")
elif key == "ORTH":
pass
elif key == "HEAD":
attrs.append(key)
values.append([h-i for i, h in enumerate(value)])
elif key == "SENT_START":
attrs.append(key)
values.append(value)
elif key == "MORPH":
attrs.append(key)
values.append([vocab.morphology.add(v) for v in value])
elif key == "ENT_IOB":
iob_strings = Token.iob_strings()
attrs.append(key)
try:
values.append([iob_strings.index(v) for v in value])
except ValueError:
raise ValueError(Errors.E985.format(values=iob_strings, value=values))
else:
attrs.append(key)
values.append([vocab.strings.add(v) for v in value])
array = numpy.asarray(values, dtype="uint64")
return attrs, array.T
def _parse_example_dict_data(example_dict):
return (
example_dict["token_annotation"],
example_dict["doc_annotation"]
)
def _fix_legacy_dict_data(predicted, example_dict):
token_dict = example_dict.get("token_annotation", {})
doc_dict = example_dict.get("doc_annotation", {})
for key, value in example_dict.items():
if key in ("token_annotation", "doc_annotation"):
pass
elif key == "ids":
pass
elif key in ("cats", "links") and value:
doc_dict[key] = value
elif key in ("ner", "entities") and value:
doc_dict["entities"] = value
else:
token_dict[key] = value
# Remap keys
remapping = {
"words": "ORTH",
"tags": "TAG",
"pos": "POS",
"lemmas": "LEMMA",
"deps": "DEP",
"heads": "HEAD",
"sent_starts": "SENT_START",
"morphs": "MORPH",
}
old_token_dict = token_dict
token_dict = {}
for key, value in old_token_dict.items():
if key in ("text", "ids", "entities", "ner", "brackets"):
pass
elif key in remapping:
token_dict[remapping[key]] = value
else:
raise ValueError(f"Unknown attr: {key}")
if "HEAD" in token_dict and "SENT_START" in token_dict:
# If heads are set, we don't also redundantly specify SENT_START.
token_dict.pop("SENT_START")
return {
"token_annotation": token_dict,
"doc_annotation": doc_dict
}
def _parse_ner_tags(vocab, words, biluo_or_offsets):
if isinstance(biluo_or_offsets[0], (list, tuple)):
# Convert to biluo if necessary
# This is annoying but to convert the offsets we need a Doc
# that has the target tokenization.
reference = Doc(vocab, words=words)
biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
else:
biluo = biluo_or_offsets
ent_iobs = []
ent_types = []
for iob_tag in biluo_to_iob(biluo):
ent_iobs.append(iob_tag.split("-")[0])
if iob_tag.startswith("I") or iob_tag.startswith("B"):
ent_types.append(iob_tag.split("-", 1)[1])
else:
ent_types.append("")
return ent_iobs, ent_types
def _parse_links(vocab, words, links, entities):
reference = Doc(vocab, words=words)
starts = {token.idx: token.i for token in reference}
ends = {token.idx + len(token): token.i for token in reference}
ent_kb_ids = ["" for _ in reference]
entity_map = [(ent[0], ent[1]) for ent in entities]
# links annotations need to refer 1-1 to entity annotations - throw error otherwise
for index, annot_dict in links.items():
start_char, end_char = index
if (start_char, end_char) not in entity_map:
raise ValueError(Errors.E984)
for index, annot_dict in links.items():
true_kb_ids = []
for key, value in annot_dict.items():
if value == 1.0:
true_kb_ids.append(key)
if len(true_kb_ids) > 1:
raise ValueError(Errors.E983)
if len(true_kb_ids) == 1:
start_char, end_char = index
start_token = starts.get(start_char)
end_token = ends.get(end_char)
for i in range(start_token, end_token+1):
ent_kb_ids[i] = true_kb_ids[0]
return ent_kb_ids
class Example:
def get_aligned(self, field):
"""Return an aligned array for a token annotation field."""
if self.doc is None:
return self.token_annotation.get_field(field)
doc = self.doc
if field == "word":
return [token.orth_ for token in doc]
gold_values = self.token_annotation.get_field(field)
alignment = self.alignment
i2j_multi = alignment.i2j_multi
gold_to_cand = alignment.gold_to_cand
cand_to_gold = alignment.cand_to_gold
output = []
for i, gold_i in enumerate(cand_to_gold):
if doc[i].text.isspace():
output.append(None)
elif gold_i is None:
if i in i2j_multi:
output.append(gold_values[i2j_multi[i]])
else:
output.append(None)
else:
output.append(gold_values[gold_i])
return output
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""
if not self.token_annotation.words:
return [self]
s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
s_brackets = []
sent_start_i = 0
t = self.token_annotation
split_examples = []
for i in range(len(t.words)):
if i > 0 and t.sent_starts[i] == 1:
split_examples.append(
Example(
doc=Doc(self.doc.vocab, words=s_words),
token_annotation=TokenAnnotation(
ids=s_ids,
words=s_words,
tags=s_tags,
pos=s_pos,
morphs=s_morphs,
lemmas=s_lemmas,
heads=s_heads,
deps=s_deps,
entities=s_ents,
sent_starts=s_sent_starts,
brackets=s_brackets,
),
doc_annotation=self.doc_annotation
)
)
s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
s_sent_starts, s_brackets = [], []
sent_start_i = i
s_ids.append(t.get_id(i))
s_words.append(t.get_word(i))
s_tags.append(t.get_tag(i))
s_pos.append(t.get_pos(i))
s_morphs.append(t.get_morph(i))
s_lemmas.append(t.get_lemma(i))
s_heads.append(t.get_head(i) - sent_start_i)
s_deps.append(t.get_dep(i))
s_ents.append(t.get_entity(i))
s_sent_starts.append(t.get_sent_start(i))
for b_end, b_label in t.brackets_by_start.get(i, []):
s_brackets.append((i - sent_start_i, b_end - sent_start_i, b_label))
i += 1
split_examples.append(
Example(
doc=Doc(self.doc.vocab, words=s_words),
token_annotation=TokenAnnotation(
ids=s_ids,
words=s_words,
tags=s_tags,
pos=s_pos,
morphs=s_morphs,
lemmas=s_lemmas,
heads=s_heads,
deps=s_deps,
entities=s_ents,
sent_starts=s_sent_starts,
brackets=s_brackets,
),
doc_annotation=self.doc_annotation
)
)
return split_examples
@classmethod
def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
"""
Return a list of Example objects, from a variety of input formats.
make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
"""
if isinstance(examples, Example):
return [examples]
if isinstance(examples, tuple):
examples = [examples]
converted_examples = []
for ex in examples:
if isinstance(ex, Example):
converted_examples.append(ex)
# convert string to Doc to Example
elif isinstance(ex, str):
if keep_raw_text:
converted_examples.append(Example(doc=ex))
else:
doc = make_doc(ex)
converted_examples.append(Example(doc=doc))
# convert tuples to Example
elif isinstance(ex, tuple) and len(ex) == 2:
doc, gold = ex
# convert string to Doc
if isinstance(doc, str) and not keep_raw_text:
doc = make_doc(doc)
converted_examples.append(Example.from_dict(gold, doc=doc))
# convert Doc to Example
elif isinstance(ex, Doc):
converted_examples.append(Example(doc=ex))
else:
converted_examples.append(ex)
return converted_examples
def _deprecated_get_gold(self, make_projective=False):
from ..syntax.gold_parse import get_parses_from_example
_, gold = get_parses_from_example(self, make_projective=make_projective)[0]
return gold
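
Finally, a hedged sketch of the NewExample API this commit is moving towards: the second argument to from_dict() is a plain annotation dict, and the reference Doc is built from it. The sentence and labels are invented; the import assumes the module builds as spacy.gold.new_example, as listed in MOD_NAMES above.

import spacy
from spacy.tokens import Doc
from spacy.gold.new_example import NewExample

nlp = spacy.blank("en")
predicted = Doc(nlp.vocab, words=["Facebook", "bought", "WhatsApp"])
eg = NewExample.from_dict(predicted, {
    "words": ["Facebook", "bought", "WhatsApp"],
    "tags": ["NNP", "VBD", "NNP"],
    "entities": [(0, 8, "ORG"), (16, 24, "ORG")],  # character offsets
})
# The gold annotations now live on a second Doc rather than on a GoldParse.
assert [t.tag_ for t in eg.reference] == ["NNP", "VBD", "NNP"]
assert [t.ent_type_ for t in eg.reference] == ["ORG", "", "ORG"]
assert eg.predicted is predicted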


@@ -636,6 +636,7 @@ class Language(object):
examples (iterable): `Example` objects.
YIELDS (tuple): `Example` objects.
"""
# TODO: This is deprecated right?
for name, proc in self.pipeline:
if hasattr(proc, "preprocess_gold"):
examples = proc.preprocess_gold(examples)
@@ -722,24 +723,26 @@ class Language(object):
DOCS: https://spacy.io/api/language#evaluate
"""
examples = Example.to_example_objects(examples, make_doc=self.make_doc)
examples = Example.to_example_objects(examples)
if scorer is None:
scorer = Scorer(pipeline=self.pipeline)
if component_cfg is None:
component_cfg = {}
docs = (eg.predicted for eg in examples)
for name, pipe in self.pipeline:
kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size)
if not hasattr(pipe, "pipe"):
examples = _pipe(examples, pipe, kwargs)
docs = _pipe(docs, pipe, kwargs)
else:
examples = pipe.pipe(examples, as_example=True, **kwargs)
for ex in examples:
docs = pipe.pipe(docs, **kwargs)
for doc, eg in zip(docs, examples):
if verbose:
print(ex.doc)
eg.predicted = doc
kwargs = component_cfg.get("scorer", {})
kwargs.setdefault("verbose", verbose)
scorer.score(ex, **kwargs)
scorer.score(eg, **kwargs)
return scorer
@contextmanager


@@ -51,9 +51,9 @@ class Morphologizer(Tagger):
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
**kwargs):
for example in get_examples():
for i, morph in enumerate(example.token_annotation.morphs):
pos = example.token_annotation.get_pos(i)
morph = Morphology.feats_to_dict(morph)
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph
norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)]
if pos:
morph["POS"] = pos
@@ -92,7 +92,7 @@ class Morphologizer(Tagger):
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
for ex in examples:
gold = ex.gold
gold = ex._deprecated_get_gold()
for i in range(len(gold.morphs)):
pos = gold.pos[i] if i < len(gold.pos) else ""
morph = gold.morphs[i]


@@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
from .functions import merge_subtokens
from ..language import Language, component
from ..syntax import nonproj
from ..gold import Example
from ..gold.new_example import NewExample as Example
from ..attrs import POS, ID
from ..util import link_vectors_to_models, create_default_optimizer
from ..parts_of_speech import X
@@ -48,55 +48,38 @@ class Pipe(object):
def from_nlp(cls, nlp, model, **cfg):
return cls(nlp.vocab, model, **cfg)
def _get_doc(self, example):
""" Use this method if the `example` can be both a Doc or an Example """
if isinstance(example, Doc):
return example
return example.doc
def __init__(self, vocab, model, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError
def __call__(self, example):
def __call__(self, Doc doc):
"""Apply the pipe to one document. The document is
modified in-place, and returned.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
doc = self._get_doc(example)
predictions = self.predict([doc])
if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
self.set_annotations([doc], scores, tensors=tensors)
else:
self.set_annotations([doc], predictions)
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(tuple) == 2:
scores, tensors = predictions
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
@@ -109,14 +92,13 @@ class Pipe(object):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
def update(self, docs, set_annotations=False, drop=0.0, sgd=None, losses=None):
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model.
Delegates to predict() and get_loss().
"""
if set_annotations:
docs = (self._get_doc(ex) for ex in examples)
docs = list(self.pipe(docs))
def rehearse(self, examples, sgd=None, losses=None, **config):
@ -255,28 +237,15 @@ class Tagger(Pipe):
def labels(self):
return tuple(self.vocab.morphology.tag_names)
def __call__(self, example):
doc = self._get_doc(example)
def __call__(self, doc):
tags = self.predict([doc])
self.set_annotations([doc], tags)
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size):
tag_ids = self.predict(docs)
assert len(docs) == len(examples)
assert len(tag_ids) == len(examples)
self.set_annotations(docs, tag_ids)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
@ -327,15 +296,17 @@ class Tagger(Pipe):
doc.is_tagged = True
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
examples = Example.to_example_objects(examples)
for eg in examples:
assert isinstance(eg, Example)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
tag_scores, bp_tag_scores = self.model.begin_update(
[eg.predicted for eg in examples])
for sc in tag_scores:
if self.model.ops.xp.isnan(sc.sum()):
raise ValueError("nan value in scores")
@ -347,17 +318,16 @@ class Tagger(Pipe):
if losses is not None:
losses[self.name] += loss
if set_annotations:
docs = [ex.doc for ex in examples]
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, self._scores2guesses(tag_scores))
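# A hedged usage sketch for the new update() signature: Example objects pair a
# predicted Doc with gold annotations (via the Example.from_dict constructor
# added elsewhere in this commit). The tag names are illustrative and the
# default tagger config may need adjusting.
from spacy.lang.en import English
from spacy.gold.new_example import NewExample as Example
from spacy.tokens import Doc

nlp = English()
tagger = nlp.create_pipe("tagger")
words = ["I", "sleep"]
examples = [Example.from_dict(Doc(nlp.vocab, words=words),
                              {"words": words, "tags": ["PRP", "VBP"]})]
tagger.begin_training(lambda: examples)
losses = {}
tagger.update(examples, losses=losses)  # reads eg.predicted and eg.get_aligned("tag")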
def rehearse(self, examples, drop=0., sgd=None, losses=None):
"""Perform a 'rehearsal' update, where we try to match the output of
an initial model.
"""
docs = [eg.predicted for eg in examples]
if self._rehearsal_model is None:
return
examples = Example.to_example_objects(examples)
docs = [ex.doc for ex in examples]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
@ -373,7 +343,7 @@ class Tagger(Pipe):
def get_loss(self, examples, scores):
loss_func = SequenceCategoricalCrossentropy(names=self.labels)
truths = [eg.gold.tags for eg in examples]
truths = [eg.get_aligned("tag") for eg in examples]
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")
@ -387,7 +357,8 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for example in get_examples():
for tag in example.token_annotation.tags:
for token in example.y:
tag = token.tag_
if tag in orig_tag_map:
new_tag_map[tag] = orig_tag_map[tag]
else:
@ -560,9 +531,9 @@ class SentenceRecognizer(Tagger):
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
for ex in examples:
gold = ex.gold
for sent_start in gold.sent_starts:
for eg in examples:
sent_starts = eg.get_aligned("sent_start")
for sent_start in sent_starts:
if sent_start is None:
correct[idx] = guesses[idx]
elif sent_start in tag_index:
@ -575,7 +546,7 @@ class SentenceRecognizer(Tagger):
d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
d_scores *= self.model.ops.asarray(known_labels)
loss = (d_scores**2).sum()
docs = [ex.doc for ex in examples]
docs = [eg.predicted for eg in examples]
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
return float(loss), d_scores
@ -686,8 +657,8 @@ class MultitaskObjective(Tagger):
gold_examples = nonproj.preprocess_training_data(get_examples())
# for raw_text, doc_annot in gold_tuples:
for example in gold_examples:
for i in range(len(example.token_annotation.ids)):
label = self.make_label(i, example.token_annotation)
for token in example.y:
label = self.make_label(token)
if label is not None and label not in self.labels:
self.labels[label] = len(self.labels)
self.model.initialize()
@ -705,13 +676,13 @@ class MultitaskObjective(Tagger):
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype="i")
guesses = scores.argmax(axis=1)
golds = [ex.gold for ex in examples]
docs = [ex.doc for ex in examples]
for i, gold in enumerate(golds):
for j in range(len(docs[i])):
# Handles alignment for tokenization differences
token_annotation = gold.get_token_annotation()
label = self.make_label(j, token_annotation)
docs = [eg.predicted for eg in examples]
for i, eg in enumerate(examples):
# Handles alignment for tokenization differences
doc_annots = eg.get_aligned()
for j in range(len(eg.predicted)):
tok_annots = {key: values[j] for key, values in doc_annots.items()}
label = self.make_label(j, tok_annots)
if label is None or label not in self.labels:
correct[idx] = guesses[idx]
else:
@ -723,83 +694,49 @@ class MultitaskObjective(Tagger):
return float(loss), d_scores
@staticmethod
def make_dep(i, token_annotation):
if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
return None
return token_annotation.deps[i]
def make_dep(token):
return token.dep_
@staticmethod
def make_tag(i, token_annotation):
return token_annotation.tags[i]
def make_tag(token):
return token.tag_
@staticmethod
def make_ent(i, token_annotation):
if token_annotation.entities is None:
return None
return token_annotation.entities[i]
def make_ent(token):
if token.ent_iob_ == "O":
return "O"
else:
return token.ent_iob_ + "-" + token.ent_type_
@staticmethod
def make_dep_tag_offset(i, token_annotation):
if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
return None
offset = token_annotation.heads[i] - i
def make_dep_tag_offset(token):
dep = token.dep_
tag = token.tag_
offset = token.head.i - token.i
offset = min(offset, 2)
offset = max(offset, -2)
return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
return f"{dep}-{tag}:{offset}"
@staticmethod
def make_ent_tag(i, token_annotation):
if token_annotation.entities is None or token_annotation.entities[i] is None:
return None
def make_ent_tag(token):
if token.ent_iob_ == "O":
ent = "O"
else:
return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
ent = token.ent_iob_ + "-" + token.ent_type_
tag = token.tag_
return f"{tag}-{ent}"
@staticmethod
def make_sent_start(target, token_annotation, cache=True, _cache={}):
def make_sent_start(token):
"""A multi-task objective for representing sentence boundaries,
using the BILU scheme (O is impossible).
The implementation of this method uses an internal cache that relies
on the identity of the heads array, to avoid requiring a new piece
of gold data. You can pass cache=False if you know the cache will
do the wrong thing.
"""
words = token_annotation.words
heads = token_annotation.heads
assert len(words) == len(heads)
assert target < len(words), (target, len(words))
if cache:
if id(heads) in _cache:
return _cache[id(heads)][target]
if token.is_sent_start and token.is_sent_end:
return "U-SENT"
elif token.is_sent_start:
return "B-SENT"
else:
for key in list(_cache.keys()):
_cache.pop(key)
sent_tags = ["I-SENT"] * len(words)
_cache[id(heads)] = sent_tags
else:
sent_tags = ["I-SENT"] * len(words)
def _find_root(child):
seen = set([child])
while child is not None and heads[child] != child:
seen.add(child)
child = heads[child]
return child
sentences = {}
for i in range(len(words)):
root = _find_root(i)
if root is None:
sent_tags[i] = None
else:
sentences.setdefault(root, []).append(i)
for root, span in sorted(sentences.items()):
if len(span) == 1:
sent_tags[span[0]] = "U-SENT"
else:
sent_tags[span[0]] = "B-SENT"
sent_tags[span[-1]] = "L-SENT"
return sent_tags[target]
return "I-SENT"
class ClozeMultitask(Pipe):
@ -832,7 +769,7 @@ class ClozeMultitask(Pipe):
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
target = vectors[ids]
gradient = self.distance.get_grad(prediction, target)
loss = self.distance.get_loss(prediction, target)
@ -842,11 +779,12 @@ class ClozeMultitask(Pipe):
pass
def rehearse(self, examples, drop=0., sgd=None, losses=None):
examples = Example.to_example_objects(examples)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop)
predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
predictions, bp_predictions = self.model.begin_update(
[eg.predicted for eg in examples])
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions)
if sgd is not None:
@ -881,17 +819,10 @@ class TextCategorizer(Pipe):
def labels(self, value):
self.cfg["labels"] = tuple(value)
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size):
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
@ -913,12 +844,15 @@ class TextCategorizer(Pipe):
doc.cats[label] = float(scores[i, j])
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
examples = Example.to_example_objects(examples)
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
for eg in examples:
assert isinstance(eg, Example)
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
scores, bp_scores = self.model.begin_update(
[eg.predicted for eg in examples]
)
loss, d_scores = self.get_loss(examples, scores)
bp_scores(d_scores)
if sgd is not None:
@ -927,14 +861,15 @@ class TextCategorizer(Pipe):
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
if set_annotations:
docs = [ex.doc for ex in examples]
docs = [eg.predicted for eg in examples]
self.set_annotations(docs, scores=scores)
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
return
examples = Example.to_example_objects(examples)
docs=[ex.doc for ex in examples]
for eg in examples:
assert isinstance(eg, Example)
docs = [eg.predicted for eg in examples]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
@ -950,13 +885,12 @@ class TextCategorizer(Pipe):
losses[self.name] += (gradient**2).sum()
def _examples_to_truth(self, examples):
gold_cats = [ex.doc_annotation.cats for ex in examples]
truths = numpy.zeros((len(gold_cats), len(self.labels)), dtype="f")
not_missing = numpy.ones((len(gold_cats), len(self.labels)), dtype="f")
for i, gold_cat in enumerate(gold_cats):
truths = numpy.zeros((len(examples), len(self.labels)), dtype="f")
not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
for i, eg in enumerate(examples):
for j, label in enumerate(self.labels):
if label in gold_cat:
truths[i, j] = gold_cat[label]
if label in eg.reference.cats:
truths[i, j] = eg.reference.cats[label]
else:
not_missing[i, j] = 0.
truths = self.model.ops.asarray(truths)
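# Hedged, standalone illustration of the truth matrix built above: labels
# present in the reference cats keep their value, absent labels are masked
# out via not_missing. The label names and values are illustrative.
import numpy

labels = ("TRAVEL", "BAKING")
reference_cats = [{"TRAVEL": 1.0}, {"TRAVEL": 0.0, "BAKING": 1.0}]  # one dict per example
truths = numpy.zeros((len(reference_cats), len(labels)), dtype="f")
not_missing = numpy.ones((len(reference_cats), len(labels)), dtype="f")
for i, cats in enumerate(reference_cats):
    for j, label in enumerate(labels):
        if label in cats:
            truths[i, j] = cats[label]
        else:
            not_missing[i, j] = 0.0
# truths      -> [[1., 0.], [0., 1.]]
# not_missing -> [[1., 0.], [1., 1.]]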
@ -993,7 +927,7 @@ class TextCategorizer(Pipe):
# TODO: begin_training is not guaranteed to see all data / labels ?
examples = list(get_examples())
for example in examples:
for cat in example.doc_annotation.cats:
for cat in example.y.cats:
self.add_label(cat)
self.require_labels()
docs = [Doc(Vocab(), words=["hello"])]
@ -1150,21 +1084,22 @@ class EntityLinker(Pipe):
losses.setdefault(self.name, 0.0)
if not examples:
return 0
examples = Example.to_example_objects(examples)
for eg in examples:
assert isinstance(eg, Example)
sentence_docs = []
docs = [ex.doc for ex in examples]
docs = [eg.predicted for eg in examples]
if set_annotations:
# This seems simpler than other ways to get that exact output -- but
# it does run the model twice :(
predictions = self.model.predict(docs)
golds = [ex.gold for ex in examples]
for doc, gold in zip(docs, golds):
for eg in examples:
doc = eg.predicted
ents_by_offset = dict()
for ent in doc.ents:
ents_by_offset[(ent.start_char, ent.end_char)] = ent
for entity, kb_dict in gold.links.items():
links = self._get_links_from_doc(eg.reference)
for entity, kb_dict in links.items():
if isinstance(entity, str):
entity = literal_eval(entity)
start, end = entity
@ -1185,7 +1120,10 @@ class EntityLinker(Pipe):
raise RuntimeError(Errors.E030)
set_dropout_rate(self.model, drop)
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
loss, d_scores = self.get_similarity_loss(
scores=sentence_encodings,
examples=examples
)
bp_context(d_scores)
if sgd is not None:
self.model.finish_update(sgd)
@ -1196,10 +1134,11 @@ class EntityLinker(Pipe):
self.set_annotations(docs, predictions)
return loss
def get_similarity_loss(self, golds, scores):
def get_similarity_loss(self, examples, scores):
entity_encodings = []
for gold in golds:
for entity, kb_dict in gold.links.items():
for eg in examples:
links = self._get_links_from_doc(eg.reference)
for entity, kb_dict in links.items():
for kb_id, value in kb_dict.items():
# this loss function assumes we're only using positive examples
if value:
@ -1218,8 +1157,9 @@ class EntityLinker(Pipe):
def get_loss(self, examples, scores):
cats = []
for ex in examples:
for entity, kb_dict in ex.gold.links.items():
for eg in examples:
links = self._get_links_from_doc(eg.reference)
for entity, kb_dict in links.items():
for kb_id, value in kb_dict.items():
cats.append([value])
@ -1232,26 +1172,18 @@ class EntityLinker(Pipe):
loss = loss / len(cats)
return loss, d_scores
def __call__(self, example):
doc = self._get_doc(example)
def _get_links_from_doc(self, doc):
return {}
def __call__(self, doc):
kb_ids, tensors = self.predict([doc])
self.set_annotations([doc], kb_ids, tensors=tensors)
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size):
kb_ids, tensors = self.predict(docs)
self.set_annotations(docs, kb_ids, tensors=tensors)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):
@ -1428,7 +1360,7 @@ class Sentencizer(Pipe):
):
pass
def __call__(self, example):
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
doc (Doc): The document to process.
@ -1436,7 +1368,6 @@ class Sentencizer(Pipe):
DOCS: https://spacy.io/api/sentencizer#call
"""
doc = self._get_doc(example)
start = 0
seen_period = False
for i, token in enumerate(doc):
@ -1450,25 +1381,16 @@ class Sentencizer(Pipe):
seen_period = True
if start < len(doc):
doc[start].is_sent_start = True
if isinstance(example, Example):
example.doc = doc
return example
return doc
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
for examples in util.minibatch(stream, size=batch_size):
docs = [self._get_doc(ex) for ex in examples]
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in util.minibatch(stream, size=batch_size):
predictions = self.predict(docs)
if isinstance(predictions, tuple) and len(predictions) == 2:
scores, tensors = predictions
self.set_annotations(docs, scores, tensors=tensors)
else:
self.set_annotations(docs, predictions)
if as_example:
for ex, doc in zip(examples, docs):
ex.doc = doc
yield ex
else:
yield from docs
def predict(self, docs):

View File

@ -286,7 +286,7 @@ class Scorer(object):
if isinstance(example, tuple) and len(example) == 2:
doc, gold = example
else:
gold = example.gold
gold = example._deprecated_get_gold()
doc = example.doc
if len(doc) != len(gold):

View File

@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
from .stateclass cimport StateClass
from ..typedefs cimport weight_t, attr_t
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParseC
from .gold_parse cimport GoldParseC
cdef class ArcEager(TransitionSystem):

View File

@ -0,0 +1,39 @@
from cymem.cymem cimport Pool
from .transition_system cimport Transition
from ..typedefs cimport attr_t
cdef struct GoldParseC:
int* tags
int* heads
int* has_dep
int* sent_start
attr_t* labels
int** brackets
Transition* ner
cdef class GoldParse:
cdef Pool mem
cdef GoldParseC c
cdef readonly object orig
cdef int length
cdef public int loss
cdef public list words
cdef public list tags
cdef public list pos
cdef public list morphs
cdef public list lemmas
cdef public list sent_starts
cdef public list heads
cdef public list labels
cdef public dict orths
cdef public list ner
cdef public dict brackets
cdef public dict cats
cdef public dict links
cdef readonly list cand_to_gold
cdef readonly list gold_to_cand

346
spacy/syntax/gold_parse.pyx Normal file
View File

@ -0,0 +1,346 @@
# cython: profile=True
import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly
import warnings
from .. import util
from . import nonproj
from ..tokens import Doc, Span
from ..errors import Errors, AlignmentError, Warnings
from ..gold.annotation import TokenAnnotation
from ..gold.iob_utils import offsets_from_biluo_tags, biluo_tags_from_offsets
from ..gold.align import align
punct_re = re.compile(r"\W")
def is_punct_label(label):
return label == "P" or label.lower() == "punct"
def get_parses_from_example(
example, merge=True, vocab=None, make_projective=True, ignore_misaligned=False
):
"""Return a list of (doc, GoldParse) objects.
If merge is set to True, keep all Token annotations as one big list."""
# merge == do not modify Example
if merge:
examples = [example]
else:
# not merging: one GoldParse per sentence, defining docs with the words
# from each sentence
examples = example.split_sents()
outputs = []
for eg in examples:
eg_dict = eg.to_dict()
try:
gp = GoldParse.from_annotation(
eg.predicted,
eg_dict["doc_annotation"],
eg_dict["token_annotation"],
make_projective=make_projective
)
except AlignmentError:
if ignore_misaligned:
gp = None
else:
raise
outputs.append((eg.predicted, gp))
return outputs
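# A hedged usage sketch for the helper above, bridging the new Example objects
# to the legacy GoldParse consumed by the parser. It assumes eg.to_dict()
# fills all token/doc annotation fields; the words, heads and deps are
# illustrative.
from spacy.lang.en import English
from spacy.gold.new_example import NewExample as Example
from spacy.tokens import Doc

nlp = English()
words = ["I", "like", "London", "."]
example = Example.from_dict(
    Doc(nlp.vocab, words=words),
    {"words": words, "heads": [1, 1, 1, 1], "deps": ["nsubj", "ROOT", "dobj", "punct"]},
)
for doc, gold in get_parses_from_example(example, merge=True, vocab=nlp.vocab):
    print(gold.heads, gold.labels)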
cdef class GoldParse:
"""Collection for training annotations.
DOCS: https://spacy.io/api/goldparse
"""
@classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
return cls(
doc,
words=token_annotation["words"],
tags=token_annotation["tags"],
pos=token_annotation["pos"],
morphs=token_annotation["morphs"],
lemmas=token_annotation["lemmas"],
heads=token_annotation["heads"],
deps=token_annotation["deps"],
entities=token_annotation["entities"],
sent_starts=token_annotation["sent_starts"],
cats=doc_annotation["cats"],
links=doc_annotation["links"],
make_projective=make_projective
)
def get_token_annotation(self):
ids = None
if self.words:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
pos=self.pos, morphs=self.morphs,
lemmas=self.lemmas, heads=self.heads,
deps=self.labels, entities=self.ner,
sent_starts=self.sent_starts)
def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
lemmas=None, heads=None, deps=None, entities=None,
sent_starts=None, make_projective=False, cats=None,
links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
pos (iterable): A sequence of strings, representing UPOS annotations.
morphs (iterable): A sequence of strings, representing morph
annotations.
lemmas (iterable): A sequence of strings, representing lemma
annotations.
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
sent_starts (iterable): A sequence of sentence position tags, 1 for
the first word in a sentence, 0 for all others.
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects
true examples of a label to have the value 1.0, and negative
examples of a label to have the value 0.0. Labels not in the
dictionary are treated as missing - the gradient for those labels
will be zero.
links (dict): A dict with `(start_char, end_char)` keys,
and the values being dicts with kb_id:value entries,
representing the external IDs in a knowledge base (KB)
mapped to either 1.0 or 0.0, indicating positive and
negative examples respectively.
RETURNS (GoldParse): The newly constructed object.
"""
self.mem = Pool()
self.loss = 0
self.length = len(doc)
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
# temporary doc for aligning entity annotation
entdoc = None
# avoid allocating memory if the doc does not contain any tokens
if self.length == 0:
self.words = []
self.tags = []
self.heads = []
self.labels = []
self.ner = []
self.morphs = []
# set a minimal orig so that the scorer can score an empty doc
self.orig = TokenAnnotation(ids=[])
else:
if not words:
words = [token.text for token in doc]
if not tags:
tags = [None for _ in words]
if not pos:
pos = [None for _ in words]
if not morphs:
morphs = [None for _ in words]
if not lemmas:
lemmas = [None for _ in words]
if not heads:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not sent_starts:
sent_starts = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
entities = ["O" for _ in words]
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], str):
# Assume we have entities specified by character offset.
# Create a temporary Doc corresponding to provided words
# (to preserve gold tokenization) and text (to preserve
# character offsets).
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
entdoc_entities = biluo_tags_from_offsets(entdoc, entities)
# There may be some additional whitespace tokens in the
# temporary doc, so check that the annotations align with
# the provided words while building a list of BILUO labels.
entities = []
words_offset = 0
for i in range(len(entdoc_words)):
if words[i + words_offset] == entdoc_words[i]:
entities.append(entdoc_entities[i])
else:
words_offset -= 1
if len(entities) != len(words):
warnings.warn(Warnings.W029.format(text=doc.text))
entities = ["-" for _ in words]
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.pos = [None] * len(doc)
self.morphs = [None] * len(doc)
self.lemmas = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.sent_starts = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and any(heads) and any(deps):
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that
# except for NER spans where the start and end can be aligned.
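# Hedged illustration (indices are examples, not from this diff): aligning
# predicted tokens ["New", "York"] against gold words ["New York"] yields
# i2j = [-1, -1] with i2j_multi = {0: 0, 1: 0}; the reverse split, predicted
# ["NewYork"] vs. gold ["New", "York"], yields j2i_multi = {0: 0, 1: 0}.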
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))),
words=words, tags=tags, pos=pos, morphs=morphs,
lemmas=lemmas, heads=heads, deps=deps, entities=entities,
sent_starts=sent_starts, brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
self.pos[i] = "SPACE"
self.morphs[i] = None
self.lemmas[i] = None
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.sent_starts[i] = 0
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.pos[i] = pos[i2j_multi[i]]
self.morphs[i] = morphs[i2j_multi[i]]
self.lemmas[i] = lemmas[i2j_multi[i]]
self.sent_starts[i] = sent_starts[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
head_i = heads[i2j_multi[i]]
if head_i:
self.heads[i] = self.gold_to_cand[head_i]
self.labels[i] = deps[i2j_multi[i]]
ner_tag = entities[i2j_multi[i]]
# Assign O/- for many-to-one O/- NER tags
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.pos[i] = pos[gold_i]
self.morphs[i] = morphs[gold_i]
self.lemmas[i] = lemmas[gold_i]
self.sent_starts[i] = sent_starts[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Assign O/- for one-to-many O/- NER tags
for j, cand_j in enumerate(self.gold_to_cand):
if cand_j is None:
if j in j2i_multi:
i = j2i_multi[j]
ner_tag = entities[j]
if ner_tag in ("O", "-"):
self.ner[i] = ner_tag
# If there is entity annotation and some tokens remain unaligned,
# align all entities at the character level to account for all
# possible token misalignments within the entity spans
if any([e not in ("O", "-") for e in entities]) and None in self.ner:
# If the temporary entdoc wasn't created above, initialize it
if not entdoc:
entdoc_words, entdoc_spaces = util.get_words_and_spaces(words, doc.text)
entdoc = Doc(doc.vocab, words=entdoc_words, spaces=entdoc_spaces)
# Get offsets based on gold words and BILUO entities
entdoc_offsets = offsets_from_biluo_tags(entdoc, entities)
aligned_offsets = []
aligned_spans = []
# Filter offsets to identify those that align with doc tokens
for offset in entdoc_offsets:
span = doc.char_span(offset[0], offset[1])
if span and not span.text.isspace():
aligned_offsets.append(offset)
aligned_spans.append(span)
# Convert back to BILUO for doc tokens and assign NER for all
# aligned spans
biluo_tags = biluo_tags_from_offsets(doc, aligned_offsets, missing=None)
for span in aligned_spans:
for i in range(span.start, span.end):
self.ner[i] = biluo_tags[i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))
def __len__(self):
"""Get the number of gold-standard tokens.
RETURNS (int): The number of gold-standard tokens.
"""
return self.length
@property
def is_projective(self):
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)

View File

@ -515,8 +515,8 @@ cdef class Parser:
good_golds = []
good_states = []
for i, eg in enumerate(whole_examples):
doc = eg.doc
gold = self.moves.preprocess_gold(eg.gold)
parses = get_parses_from_example(eg)
doc, gold = parses[0]
if gold is not None and self.moves.has_gold(gold):
good_docs.append(doc)
good_golds.append(gold)
@ -535,8 +535,12 @@ cdef class Parser:
cdef:
StateClass state
Transition action
whole_docs = [ex.doc for ex in whole_examples]
whole_golds = [ex.gold for ex in whole_examples]
whole_docs = []
whole_golds = []
for eg in whole_examples:
for doc, gold in get_parses_from_example(eg):
whole_docs.append(doc)
whole_golds.append(gold)
whole_states = self.moves.init_batch(whole_docs)
max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
max_moves = 0
@ -625,7 +629,7 @@ cdef class Parser:
doc_sample = []
gold_sample = []
for example in islice(get_examples(), 10):
parses = example.get_gold_parses(merge=False, vocab=self.vocab)
parses = get_parses_from_example(example, merge=False, vocab=self.vocab)
for doc, gold in parses:
if len(doc):
doc_sample.append(doc)

View File

@ -7,7 +7,7 @@ from copy import copy
from ..tokens.doc cimport Doc, set_children_from_heads
from ..gold import Example
from ..gold import Example, TokenAnnotation
from ..errors import Errors
@ -108,7 +108,7 @@ def preprocess_training_data(gold_data, label_freq_cutoff=30):
proj_token_dict = example.token_annotation.to_dict()
proj_token_dict["heads"] = proj_heads
proj_token_dict["deps"] = deco_deps
new_example.set_token_annotation(**proj_token_dict)
new_example.token_annotation = TokenAnnotation(**proj_token_dict)
preprocessed.append(new_example)
if label_freq_cutoff > 0:
return _filter_labels(preprocessed, label_freq_cutoff, freqs)
@ -216,6 +216,6 @@ def _filter_labels(examples, cutoff, freqs):
filtered_labels.append(label)
filtered_token_dict = example.token_annotation.to_dict()
filtered_token_dict["deps"] = filtered_labels
new_example.set_token_annotation(**filtered_token_dict)
new_example.token_annotation = TokenAnnotation(**filtered_token_dict)
filtered.append(new_example)
return filtered

View File

@ -35,7 +35,10 @@ def _train_parser(parser):
for i in range(5):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
gold = {
"heads": [1, 1, 3, 3],
"deps": ["left", "ROOT", "left", "ROOT"]
}
parser.update((doc, gold), sgd=sgd, losses=losses)
return parser
@ -47,9 +50,10 @@ def test_add_label(parser):
for i in range(100):
losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = GoldParse(
doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
)
gold = {
"heads": [1, 1, 3, 3],
"deps": ["right", "ROOT", "left", "ROOT"]
}
parser.update((doc, gold), sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc)

View File

@ -47,7 +47,7 @@ def doc(vocab):
@pytest.fixture
def gold(doc):
return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"])
return {"heads": [1, 1, 1], "deps": ["L", "ROOT", "R"]}
def test_can_init_nn_parser(parser):

View File

@ -1,7 +1,6 @@
import pytest
from thinc.api import Adam
from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser
@ -28,7 +27,7 @@ def parser(vocab):
for i in range(10):
losses = {}
doc = Doc(vocab, words=["a", "b", "c", "d"])
gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
gold = dict(heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
parser.update((doc, gold), sgd=sgd, losses=losses)
return parser

View File

@ -3,7 +3,7 @@ import gc
import numpy
import copy
from spacy.gold import Example
from spacy.gold import Example, TokenAnnotation
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop
@ -272,9 +272,16 @@ def test_issue1963(en_tokenizer):
def test_issue1967(label):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example(doc=None)
example.set_token_annotation(
ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label]
example = Example(
doc=Doc(ner.vocab, words=["word"]),
token_annotation=TokenAnnotation(
ids=[0],
words=["word"],
tags=["tag"],
heads=[0],
deps=["dep"],
entities=[label]
)
)
ner.moves.get_actions(gold_parses=[example])

View File

@ -1,9 +1,12 @@
from spacy.errors import AlignmentError
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo, align
from spacy.gold import GoldCorpus, docs_to_json, Example, DocAnnotation
from spacy.gold import spans_from_biluo_tags, iob_to_biluo, align
from spacy.gold import GoldCorpus, docs_to_json, DocAnnotation
from spacy.gold.new_example import NewExample as Example
from spacy.lang.en import English
from spacy.syntax.nonproj import is_nonproj_tree
from spacy.syntax.gold_parse import GoldParse, get_parses_from_example
from spacy.syntax.gold_parse import get_parses_from_example
from spacy.tokens import Doc
from spacy.util import get_words_and_spaces, compounding, minibatch
import pytest
@ -90,10 +93,16 @@ def merged_dict():
"ids": [1, 2, 3, 4, 5, 6, 7],
"words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
"tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
"sent_starts": [1, 0, 0, 1, 0, 0, 0, 0],
"sent_starts": [1, 0, 0, 1, 0, 0, 0],
}
@pytest.fixture
def vocab():
nlp = English()
return nlp.vocab
def test_gold_biluo_U(en_vocab):
words = ["I", "flew", "to", "London", "."]
spaces = [True, True, True, False, True]
@ -270,88 +279,38 @@ def test_roundtrip_docs_to_json(doc):
srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))
reloaded_example = next(goldcorpus.dev_dataset(nlp))
goldparse = reloaded_example.gold
reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text
assert tags == goldparse.tags
assert pos == goldparse.pos
assert morphs == goldparse.morphs
assert lemmas == goldparse.lemmas
assert deps == goldparse.labels
assert heads == goldparse.heads
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
assert cats["BAKING"] == goldparse.cats["BAKING"]
# roundtrip to JSONL train dicts
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "roundtrip.jsonl"
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
reloaded_example = next(goldcorpus.dev_dataset(nlp))
goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text
assert tags == goldparse.tags
assert pos == goldparse.pos
assert morphs == goldparse.morphs
assert lemmas == goldparse.lemmas
assert deps == goldparse.labels
assert heads == goldparse.heads
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
assert cats["BAKING"] == goldparse.cats["BAKING"]
# roundtrip to JSONL tuples
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "roundtrip.jsonl"
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
# load and rewrite as JSONL tuples
srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
reloaded_example = next(goldcorpus.dev_dataset(nlp))
goldparse = reloaded_example.gold
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_example.text
assert tags == goldparse.tags
assert deps == goldparse.labels
assert heads == goldparse.heads
assert lemmas == goldparse.lemmas
assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
assert cats["BAKING"] == goldparse.cats["BAKING"]
assert text == reloaded_example.predicted.text
assert tags == [t.tag_ for t in reloaded_example.reference]
assert pos == [t.pos_ for t in reloaded_example.reference]
assert morphs == [t.morph_ for t in reloaded_example.reference]
assert lemmas == [t.lemma_ for t in reloaded_example.reference]
assert deps == [t.dep_ for t in reloaded_example.reference]
assert heads == [t.head.i for t in reloaded_example.reference]
assert "TRAVEL" in reloaded_example.reference.cats
assert "BAKING" in reloaded_example.reference.cats
assert cats["TRAVEL"] == reloaded_example.reference.cats["TRAVEL"]
assert cats["BAKING"] == reloaded_example.reference.cats["BAKING"]
@pytest.mark.xfail # TODO do we need to do the projectivity differently?
def test_projective_train_vs_nonprojective_dev(doc):
nlp = English()
deps = [t.dep_ for t in doc]
heads = [t.head.i for t in doc]
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
json_file = tmpdir / "test.json"
# write to JSON train dicts
srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(json_file), str(json_file))
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
train_goldparse = train_reloaded_example.gold
train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
dev_reloaded_example = next(goldcorpus.dev_dataset(nlp))
dev_goldparse = dev_reloaded_example.gold
dev_goldparse = get_parses_from_example(dev_reloaded_example)[0][1]
assert is_nonproj_tree([t.head.i for t in doc]) is True
assert is_nonproj_tree(train_goldparse.heads) is False
@ -364,27 +323,31 @@ def test_projective_train_vs_nonprojective_dev(doc):
assert deps == dev_goldparse.labels
# Hm, not sure where misalignment check would be handled? In the components too?
# I guess that does make sense. A text categorizer doesn't care if it's
# misaligned...
@pytest.mark.xfail # TODO
def test_ignore_misaligned(doc):
nlp = English()
text = doc.text
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
json_file = tmpdir / "test.json"
data = [docs_to_json(doc)]
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, data)
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
# write to JSON train dicts
srsly.write_json(json_file, data)
goldcorpus = GoldCorpus(str(json_file), str(json_file))
with pytest.raises(AlignmentError):
train_reloaded_example = next(goldcorpus.train_dataset(nlp))
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
json_file = tmpdir / "test.json"
data = [docs_to_json(doc)]
data[0]["paragraphs"][0]["raw"] = text.replace("Sarah", "Jane")
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, data)
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
# write to JSON train dicts
srsly.write_json(json_file, data)
goldcorpus = GoldCorpus(str(json_file), str(json_file))
# doesn't raise an AlignmentError, but there is nothing to iterate over
# because the only example can't be aligned
@ -395,14 +358,14 @@ def test_ignore_misaligned(doc):
def test_make_orth_variants(doc):
nlp = English()
with make_tempdir() as tmpdir:
jsonl_file = tmpdir / "test.jsonl"
# write to JSONL train dicts
srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
json_file = tmpdir / "test.json"
# write to JSON train dicts
srsly.write_json(json_file, [docs_to_json(doc)])
goldcorpus = GoldCorpus(str(json_file), str(json_file))
# due to randomness, test only that this runs with no errors for now
train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
train_goldparse = train_reloaded_example.gold # noqa: F841
train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
@pytest.mark.parametrize(
@ -456,20 +419,6 @@ def test_gold_constructor():
assert gold.words == ["This", "is", "a", "sentence"]
def test_gold_orig_annot():
nlp = English()
doc = nlp("This is a sentence")
gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
assert gold.orig.words == ["This", "is", "a", "sentence"]
assert gold.cats["cat1"]
doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0})
gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig)
assert gold2.orig.words == ["This", "is", "a", "sentence"]
assert not gold2.cats["cat1"]
def test_tuple_format_implicit():
"""Test tuple format with implicit GoldParse creation"""
@ -485,6 +434,7 @@ def test_tuple_format_implicit():
_train(train_data)
@pytest.mark.xfail # TODO
def test_tuple_format_implicit_invalid():
"""Test that an error is thrown for an implicit invalid GoldParse field"""
@ -518,43 +468,51 @@ def _train(train_data):
def test_split_sents(merged_dict):
nlp = English()
example = Example()
example.set_token_annotation(**merged_dict)
assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1
example = Example.from_dict(
Doc(nlp.vocab, words=merged_dict["words"]),
merged_dict
)
assert len(get_parses_from_example(
example,
merge=False,
vocab=nlp.vocab,
make_projective=False)
) == 2
assert len(get_parses_from_example(
example,
merge=True,
vocab=nlp.vocab,
make_projective=False
)) == 1
split_examples = example.split_sents()
assert len(split_examples) == 2
token_annotation_1 = split_examples[0].token_annotation
assert token_annotation_1.ids == [1, 2, 3]
assert token_annotation_1.words == ["Hi", "there", "everyone"]
assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
assert token_annotation_1.sent_starts == [1, 0, 0]
token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"]
assert token_annotation_1["sent_starts"] == [1, 0, 0]
token_annotation_2 = split_examples[1].token_annotation
assert token_annotation_2.ids == [4, 5, 6, 7]
assert token_annotation_2.words == ["It", "is", "just", "me"]
assert token_annotation_2.tags == ["PRON", "AUX", "ADV", "PRON"]
assert token_annotation_2.sent_starts == [1, 0, 0, 0]
token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
assert token_annotation_2["words"] == ["It", "is", "just", "me"]
assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
def test_tuples_to_example(merged_dict):
ex = Example()
ex.set_token_annotation(**merged_dict)
# This fails on some None value? Need to look into that.
@pytest.mark.xfail # TODO
def test_tuples_to_example(vocab, merged_dict):
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
ex.set_doc_annotation(cats=cats)
ex_dict = ex.to_dict()
assert ex_dict["token_annotation"]["ids"] == merged_dict["ids"]
assert ex_dict["token_annotation"]["words"] == merged_dict["words"]
assert ex_dict["token_annotation"]["tags"] == merged_dict["tags"]
assert ex_dict["token_annotation"]["sent_starts"] == merged_dict["sent_starts"]
assert ex_dict["doc_annotation"]["cats"] == cats
def test_empty_example_goldparse():
nlp = English()
doc = nlp("")
example = Example(doc=doc)
assert len(example.get_gold_parses()) == 1
merged_dict = dict(merged_dict)
merged_dict["cats"] = cats
ex = Example.from_dict(
Doc(vocab, words=merged_dict["words"]),
merged_dict
)
words = [token.text for token in ex.reference]
assert words == merged_dict["words"]
tags = [token.tag_ for token in ex.reference]
assert tags == merged_dict["tags"]
sent_starts = [token.is_sent_start for token in ex.reference]
assert sent_starts == [bool(v) for v in merged_dict["sent_starts"]]
assert ex.reference.cats == cats

View File

@ -19,22 +19,16 @@ def nlp():
return nlp
@pytest.mark.xfail # TODO
def test_language_update(nlp):
text = "hello world"
annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
wrongkeyannots = {"LABEL": True}
doc = Doc(nlp.vocab, words=text.split(" "))
gold = GoldParse(doc, **annots)
# Update with doc and gold objects
nlp.update((doc, gold))
# Update with text and dict
nlp.update((text, annots))
# Update with doc object and dict
nlp.update((doc, annots))
# Update with text and gold object
nlp.update((text, gold))
# Update with empty doc and gold object
nlp.update((None, gold))
# Update badly
with pytest.raises(ValueError):
nlp.update((doc, None))
@ -44,20 +38,16 @@ def test_language_update(nlp):
def test_language_evaluate(nlp):
text = "hello world"
annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
annots = {
"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
}
doc = Doc(nlp.vocab, words=text.split(" "))
gold = GoldParse(doc, **annots)
# Evaluate with doc and gold objects
nlp.evaluate([(doc, gold)])
# Evaluate with text and dict
nlp.evaluate([(text, annots)])
# Evaluate with doc object and dict
nlp.evaluate([(doc, annots)])
# Evaluate with text and gold object
nlp.evaluate([(text, gold)])
# Evaluate badly
with pytest.raises(Exception):
nlp.evaluate([text, gold])
nlp.evaluate([text, annots])
def test_evaluate_no_pipe(nlp):

View File

@ -0,0 +1,186 @@
import pytest
from spacy.gold.new_example import NewExample as Example
from spacy.tokens import Doc
from spacy.vocab import Vocab
def test_Example_init_requires_doc_objects():
vocab = Vocab()
with pytest.raises(TypeError):
eg = Example(None, None)
with pytest.raises(TypeError):
eg = Example(Doc(vocab, words=["hi"]), None)
with pytest.raises(TypeError):
eg = Example(None, Doc(vocab, words=["hi"]))
def test_Example_from_dict_basic():
eg = Example.from_dict(
Doc(Vocab(), words=["hello", "world"]), {"words": ["hello", "world"]}
)
assert isinstance(eg.x, Doc)
assert isinstance(eg.y, Doc)
@pytest.mark.parametrize(
"annots", [{"words": ["ice", "cream"], "weirdannots": ["something", "such"]}]
)
def test_Example_from_dict_invalid(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError):
Example.from_dict(predicted, annots)
@pytest.mark.parametrize("annots", [{"words": ["ice", "cream"], "tags": ["NN", "NN"]}])
def test_Example_from_dict_with_tags(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
for i, token in enumerate(eg.reference):
assert token.tag_ == annots["tags"][i]
@pytest.mark.parametrize(
"annots",
[
{
"words": ["I", "like", "London", "and", "Berlin", "."],
"deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
"heads": [1, 1, 1, 2, 2, 1],
}
],
)
def test_Example_from_dict_with_parse(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
for i, token in enumerate(eg.reference):
assert token.dep_ == annots["deps"][i]
assert token.head.i == annots["heads"][i]
@pytest.mark.parametrize(
"annots",
[
{
"words": ["Sarah", "'s", "sister", "flew"],
"morphs": [
"NounType=prop|Number=sing",
"Poss=yes",
"Number=sing",
"Tense=past|VerbForm=fin",
],
}
],
)
def test_Example_from_dict_with_morphology(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
for i, token in enumerate(eg.reference):
assert token.morph_ == annots["morphs"][i]
@pytest.mark.parametrize(
"annots",
[
{
"words": ["This", "is", "one", "sentence", "this", "is", "another"],
"sent_starts": [1, 0, 0, 0, 1, 0, 0],
}
],
)
def test_Example_from_dict_with_sent_start(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
assert len(list(eg.reference.sents)) == 2
for i, token in enumerate(eg.reference):
assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
@pytest.mark.parametrize(
"annots",
[
{
"words": ["This", "is", "a", "sentence"],
"cats": {"cat1": 1.0, "cat2": 0.0, "cat3": 0.5},
}
],
)
def test_Example_from_dict_with_cats(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
assert len(list(eg.reference.cats)) == 3
assert eg.reference.cats["cat1"] == 1.0
assert eg.reference.cats["cat2"] == 0.0
assert eg.reference.cats["cat3"] == 0.5
@pytest.mark.parametrize(
"annots",
[
{
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
}
],
)
def test_Example_from_dict_with_entities(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
assert len(list(eg.reference.ents)) == 2
assert eg.reference[0].ent_iob_ == "O"
assert eg.reference[1].ent_iob_ == "O"
assert eg.reference[2].ent_iob_ == "B"
assert eg.reference[3].ent_iob_ == "I"
assert eg.reference[4].ent_iob_ == "O"
assert eg.reference[5].ent_iob_ == "B"
assert eg.reference[6].ent_iob_ == "O"
assert eg.reference[2].ent_type_ == "LOC"
assert eg.reference[3].ent_type_ == "LOC"
assert eg.reference[5].ent_type_ == "LOC"
@pytest.mark.parametrize(
"annots",
[
{
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
"links": {(7, 15): {"Q60": 1.0, "Q64": 0.0}, (20, 26): {"Q60": 0.0, "Q64": 1.0}},
}
],
)
def test_Example_from_dict_with_links(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
eg = Example.from_dict(predicted, annots)
assert eg.reference[0].ent_kb_id_ == ""
assert eg.reference[1].ent_kb_id_ == ""
assert eg.reference[2].ent_kb_id_ == "Q60"
assert eg.reference[3].ent_kb_id_ == "Q60"
assert eg.reference[4].ent_kb_id_ == ""
assert eg.reference[5].ent_kb_id_ == "Q64"
assert eg.reference[6].ent_kb_id_ == ""
@pytest.mark.parametrize(
"annots",
[
{
"words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")],
"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
}
],
)
def test_Example_from_dict_with_links_invalid(annots):
vocab = Vocab()
predicted = Doc(vocab, words=annots["words"])
with pytest.raises(ValueError):
Example.from_dict(predicted, annots)

View File

@ -1,12 +1,14 @@
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.gold import Example, GoldParse
from spacy.gold import Example, GoldParse, TokenAnnotation
from spacy.gold.iob_utils import biluo_tags_from_offsets
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc
from spacy.lang.en import English
test_las_apple = [
[
"Apple is looking at buying U.K. startup for $ 1 billion",
@ -134,8 +136,11 @@ def test_ner_per_type(en_vocab):
words=input_.split(" "),
ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
)
ex = Example(doc=doc)
ex.set_token_annotation(entities=annot["entities"])
entities = biluo_tags_from_offsets(doc, annot["entities"])
ex = Example(
doc=doc,
token_annotation=TokenAnnotation(entities=entities)
)
scorer.score(ex)
results = scorer.scores
@ -155,8 +160,11 @@ def test_ner_per_type(en_vocab):
words=input_.split(" "),
ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
)
ex = Example(doc=doc)
ex.set_token_annotation(entities=annot["entities"])
entities = biluo_tags_from_offsets(doc, annot["entities"])
ex = Example(
doc=doc,
token_annotation=TokenAnnotation(entities=entities)
)
scorer.score(ex)
results = scorer.scores

View File

@ -799,6 +799,8 @@ cdef class Doc:
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
cdef int length = len(array)
if length != len(self):
raise ValueError("Cannot set array values longer than the document.")
# Get set up for fast loading
cdef Pool mem = Pool()
cdef int n_attrs = len(attrs)
@ -823,6 +825,13 @@ cdef class Doc:
for i in range(length):
if array[i, col] != 0:
self.vocab.morphology.assign_tag(&tokens[i], array[i, col])
# Verify ENT_IOB are proper integers
if ENT_IOB in attrs:
iob_strings = Token.iob_strings()
col = attrs.index(ENT_IOB)
for i in range(length):
if array[i, col] not in range(0, len(iob_strings)):
raise ValueError(Errors.E985.format(values=iob_strings, value=array[i, col]))
# Now load the data
for i in range(length):
token = &self.c[i]
@ -881,6 +890,32 @@ cdef class Doc:
def to_bytes(self, exclude=tuple(), **kwargs):
"""Serialize, i.e. export the document contents to a binary string.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
DOCS: https://spacy.io/api/doc#to_bytes
"""
return srsly.msgpack_dumps(self.to_dict(exclude=exclude, **kwargs))
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
"""Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): Itself.
DOCS: https://spacy.io/api/doc#from_bytes
"""
return self.from_dict(
srsly.msgpack_loads(bytes_data),
exclude=exclude,
**kwargs
)
def to_dict(self, exclude=tuple(), **kwargs):
"""Export the document contents to a dictionary for serialization.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
@ -917,9 +952,9 @@ cdef class Doc:
serializers["user_data_keys"] = lambda: srsly.msgpack_dumps(user_data_keys)
if "user_data_values" not in exclude:
serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values)
return util.to_bytes(serializers, exclude)
return util.to_dict(serializers, exclude)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
def from_dict(self, msg, exclude=tuple(), **kwargs):
"""Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from.
@ -943,7 +978,6 @@ cdef class Doc:
for key in kwargs:
if key in deserializers or key in ("user_data",):
raise ValueError(Errors.E128.format(arg=key))
msg = util.from_bytes(bytes_data, deserializers, exclude)
# Msgpack doesn't distinguish between lists and tuples, which is
# vexing for user data. As a best guess, we *know* that within
# keys, we must have tuples. In values we just have to hope
@ -975,6 +1009,7 @@ cdef class Doc:
self.from_array(msg["array_head"][2:], attrs[:, 2:])
return self
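# A hedged usage sketch: to_bytes()/from_bytes() now route through the new
# plain-dict to_dict()/from_dict() helpers, so both representations stay in
# sync. The words below are illustrative.
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=["hello", "world"])
restored = Doc(doc.vocab).from_bytes(doc.to_bytes())
assert [t.text for t in restored] == ["hello", "world"]
state = doc.to_dict()   # same content as to_bytes(), before msgpack encoding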
def extend_tensor(self, tensor):
"""Concatenate a new tensor onto the doc.tensor object.

View File

@ -778,6 +778,10 @@ cdef class Token:
"""
return self.c.ent_iob
@classmethod
def iob_strings(cls):
return ("", "I", "O", "B")
@property
def ent_iob_(self):
"""IOB code of named entity tag. "B" means the token begins an entity,
@ -787,8 +791,7 @@ cdef class Token:
RETURNS (str): IOB code of named entity tag.
"""
iob_strings = ("", "I", "O", "B")
return iob_strings[self.c.ent_iob]
return self.iob_strings()[self.c.ent_iob]
property ent_id:
"""RETURNS (uint64): ID of the entity the token is an instance of,

View File

@ -819,16 +819,23 @@ def filter_spans(spans):
def to_bytes(getters, exclude):
return srsly.msgpack_dumps(to_dict(getters, exclude))
def from_bytes(bytes_data, setters, exclude):
return from_dict(srsly.msgpack_loads(bytes_data), setters, exclude)
def to_dict(getters, exclude):
serialized = {}
for key, getter in getters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude:
serialized[key] = getter()
return srsly.msgpack_dumps(serialized)
return serialized
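# A hedged sketch of the refactored helpers: to_bytes()/from_bytes() are now
# thin msgpack wrappers around to_dict()/from_dict(). The getter/setter names
# and values below are illustrative assumptions.
getters = {"meta": lambda: {"lang": "en"}, "count": lambda: 3}
state = to_dict(getters, exclude=[])   # {"meta": {"lang": "en"}, "count": 3}
data = to_bytes(getters, exclude=[])   # msgpack-encoded version of the same

restored = {}
setters = {"meta": lambda v: restored.__setitem__("meta", v),
           "count": lambda v: restored.__setitem__("count", v)}
from_bytes(data, setters, exclude=["meta"])  # "meta" is skipped via exclude
assert restored == {"count": 3}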
def from_bytes(bytes_data, setters, exclude):
msg = srsly.msgpack_loads(bytes_data)
def from_dict(msg, setters, exclude):
for key, setter in setters.items():
# Split to support file names like meta.json
if key.split(".")[0] not in exclude and key in msg: