2017-03-15 17:29:42 +03:00
|
|
|
# cython: profile=True
|
2015-05-24 22:50:48 +03:00
|
|
|
import re
|
2017-05-21 17:06:17 +03:00
|
|
|
import random
|
2018-03-27 20:23:02 +03:00
|
|
|
import numpy
|
|
|
|
import tempfile
|
|
|
|
import shutil
|
2019-08-28 10:14:20 +03:00
|
|
|
import itertools
|
2018-03-27 20:23:02 +03:00
|
|
|
from pathlib import Path
|
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉
See here: https://github.com/explosion/srsly
Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.
At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.
srsly currently includes forks of the following packages:
ujson
msgpack
msgpack-numpy
cloudpickle
* WIP: replace json/ujson with srsly
* Replace ujson in examples
Use regular json instead of srsly to make code easier to read and follow
* Update requirements
* Fix imports
* Fix typos
* Replace msgpack with srsly
* Fix warning
2018-12-03 03:28:22 +03:00
|
|
|
import srsly
|
2020-02-28 14:20:23 +03:00
|
|
|
import warnings
|
2018-03-27 20:23:02 +03:00
|
|
|
|
2016-03-01 12:09:08 +03:00
|
|
|
from .syntax import nonproj
|
2019-02-06 13:50:26 +03:00
|
|
|
from .tokens import Doc, Span
|
2020-02-28 14:20:23 +03:00
|
|
|
from .errors import Errors, AlignmentError, Warnings
|
2017-10-27 22:07:59 +03:00
|
|
|
from . import util
|
2015-02-21 19:06:58 +03:00
|
|
|
|
2018-11-28 20:04:58 +03:00
|
|
|
|
2019-03-08 13:42:26 +03:00
|
|
|
# Pre-compiled pattern matching a single non-word character (regex `\W`).
punct_re = re.compile(r"\W")
|
|
|
|
|
|
|
|
|
2015-06-08 01:54:13 +03:00
|
|
|
def tags_to_entities(tags):
    """Collect entity spans from a sequence of B/I/L/U/O-prefixed tags.

    tags (list): The tags, one per token. `None` and "-" entries are skipped.
    RETURNS (list): `(label, start, end)` tuples with inclusive token indices,
        where `label` is the tag text after the two-character prefix.
    RAISES (ValueError): If an "I" tag appears while no span is open (E067),
        or if a tag has an unrecognised prefix (E068).
    """
    spans = []
    open_at = None
    for idx, tag in enumerate(tags):
        # Missing annotations carry no span information.
        if tag is None or tag == "-":
            continue
        prefix = tag[:1]
        if prefix == "O":
            # TODO: We shouldn't be getting these malformed inputs. Fix this.
            # An "O" silently abandons a span that was opened but never closed.
            open_at = None
        elif prefix == "I":
            if open_at is None:
                raise ValueError(Errors.E067.format(tags=tags[:idx + 1]))
        elif prefix == "U":
            spans.append((tag[2:], idx, idx))
        elif prefix == "B":
            open_at = idx
        elif prefix == "L":
            # NOTE(review): an "L" without a preceding "B" records start=None.
            spans.append((tag[2:], open_at, idx))
            open_at = None
        else:
            raise ValueError(Errors.E068.format(tag=tag))
    return spans
|
|
|
|
|
|
|
|
|
2019-10-28 13:40:12 +03:00
|
|
|
def _normalize_for_alignment(tokens):
|
2019-10-27 15:47:08 +03:00
|
|
|
tokens = [w.replace(" ", "").lower() for w in tokens]
|
2019-10-28 13:40:12 +03:00
|
|
|
output = []
|
|
|
|
for token in tokens:
|
|
|
|
token = token.replace(" ", "").lower()
|
|
|
|
output.append(token)
|
|
|
|
return output
|
|
|
|
|
|
|
|
|
2019-07-17 15:29:52 +03:00
|
|
|
def align(tokens_a, tokens_b):
    """Calculate alignment tables between two tokenizations.

    Both token sequences are normalized with `_normalize_for_alignment`
    before being compared, then walked in parallel with one cursor each.

    tokens_a (List[str]): The candidate tokenization.
    tokens_b (List[str]): The reference tokenization.
    RETURNS: (tuple): A 5-tuple consisting of the following information:
      * cost (int): The number of misaligned tokens.
      * a2b (List[int]): Mapping of indices in `tokens_a` to indices in
        `tokens_b`. For instance, if `a2b[4] == 6`, that means that
        `tokens_a[4]` aligns to `tokens_b[6]`. If there's no one-to-one
        alignment for a token, it has the value -1.
      * b2a (List[int]): The same as `a2b`, but mapping the other direction.
      * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
        to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
        the same token of `tokens_b`.
      * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
        direction.
    RAISES (AlignmentError): If the remaining texts at the two cursors are
        neither equal nor a prefix of one another.
    """
    tokens_a = _normalize_for_alignment(tokens_a)
    tokens_b = _normalize_for_alignment(tokens_b)
    n_a = len(tokens_a)
    n_b = len(tokens_b)
    # -1 marks "no one-to-one alignment".
    a2b = numpy.full(n_a, -1, dtype="i")
    b2a = numpy.full(n_b, -1, dtype="i")
    a2b_multi = {}
    b2a_multi = {}
    cost = 0
    idx_a = idx_b = 0
    # Number of characters of the current token already consumed on each side.
    offset_a = offset_b = 0
    while idx_a < n_a and idx_b < n_b:
        rest_a = tokens_a[idx_a][offset_a:]
        rest_b = tokens_b[idx_b][offset_b:]
        if rest_a == rest_b:
            if offset_a == 0 and offset_b == 0:
                # Whole tokens match: one-to-one alignment.
                a2b[idx_a] = idx_b
                b2a[idx_b] = idx_a
            elif offset_a == 0:
                cost += 2
                a2b_multi[idx_a] = idx_b
            elif offset_b == 0:
                cost += 2
                b2a_multi[idx_b] = idx_a
            offset_a = offset_b = 0
            idx_a += 1
            idx_b += 1
        elif rest_a == "":
            # Token of `tokens_a` that is empty after normalization.
            assert offset_a == 0
            cost += 1
            idx_a += 1
        elif rest_b == "":
            # Token of `tokens_b` that is empty after normalization.
            assert offset_b == 0
            cost += 1
            idx_b += 1
        elif rest_b.startswith(rest_a):
            # The `a` token is a piece of the current `b` token.
            cost += 1
            if offset_a == 0:
                a2b_multi[idx_a] = idx_b
            idx_a += 1
            offset_a = 0
            offset_b += len(rest_a)
        elif rest_a.startswith(rest_b):
            # The `b` token is a piece of the current `a` token.
            cost += 1
            if offset_b == 0:
                b2a_multi[idx_b] = idx_a
            idx_b += 1
            offset_b = 0
            offset_a += len(rest_b)
        else:
            assert "".join(tokens_a) != "".join(tokens_b)
            raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
    return cost, a2b, b2a, a2b_multi, b2a_multi
|
2015-05-24 22:50:48 +03:00
|
|
|
|
2015-05-27 20:13:11 +03:00
|
|
|
|
2017-05-21 17:06:17 +03:00
|
|
|
class GoldCorpus(object):
    """An annotated corpus, using the JSON file format. Manages
    annotations for tagging, dependency parsing and NER.

    The examples are written to a temporary directory, one file per example,
    and read back from there so they can be shuffled and streamed.

    DOCS: https://spacy.io/api/goldcorpus
    """

    def __init__(self, train, dev, gold_preproc=False, limit=None):
        """Create a GoldCorpus.

        train (str or Path or iterable): File or directory of training data.
            Any other value is passed to `write_msgpack` as an iterable of
            examples.
        dev (str or Path or iterable): File or directory of development data.
            Only treated as a path when `train` is a str or Path.
        gold_preproc (bool): Accepted, but not used by the constructor.
        limit (int): Maximum number of examples to write per split. A falsy
            value means no limit.
        RETURNS (GoldCorpus): The newly created object.
        """
        self.limit = limit
        if isinstance(train, (str, Path)):
            train = self.read_examples(self.walk_corpus(train))
            dev = self.read_examples(self.walk_corpus(dev))
        # Write temp directory with one doc per file, so we can shuffle and stream
        self.tmp_dir = Path(tempfile.mkdtemp())
        self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
        self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)

    def __del__(self):
        """Remove the temporary directory, if one was created."""
        # `tmp_dir` is missing when `__init__` raised before creating it, and
        # the directory may already be gone if cleanup ran before.
        tmp_dir = getattr(self, "tmp_dir", None)
        if tmp_dir is not None and tmp_dir.exists():
            shutil.rmtree(tmp_dir)

    @staticmethod
    def write_msgpack(directory, examples, limit=0):
        """Write each example to `directory` as `<index>.msg`.

        directory (Path): Output directory. Created if it doesn't exist.
        examples (iterable): Objects exposing `to_dict()` and `text`.
        limit (int): Stop after writing this many examples. A falsy value
            means no limit.
        """
        if not directory.exists():
            directory.mkdir()
        for i, example in enumerate(examples):
            ex_dict = example.to_dict()
            text = example.text
            srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict))
            if limit and i + 1 >= limit:
                break

    @staticmethod
    def walk_corpus(path):
        """Collect the .json and .jsonl files found under a path.

        path (str or Path): A file or a directory.
        RETURNS (list): `[path]` if `path` is not a directory, otherwise the
            files ending in ".json" or ".jsonl" found by walking it. Entries
            whose name starts with "." are skipped.
        """
        path = util.ensure_path(path)
        if not path.is_dir():
            return [path]
        # `paths` doubles as the work queue: directories append their children
        # while the loop is still iterating over the list.
        paths = [path]
        locs = []
        seen = set()
        for path in paths:
            if str(path) in seen:
                continue
            seen.add(str(path))
            # `name` rather than `parts[-1]`: Path(".") has no parts at all.
            name = path.name
            if name.startswith("."):
                continue
            elif path.is_dir():
                paths.extend(path.iterdir())
            elif name.endswith((".json", ".jsonl")):
                locs.append(path)
        return locs

    @staticmethod
    def read_examples(locs, limit=0):
        """Yield training examples from "json", "jsonl" and "msg" files.

        locs (iterable): File locations (str or Path).
        limit (int): Stop after yielding this many examples in total. A falsy
            value means no limit.
        YIELDS (Example): The examples, file by file.
        RAISES (ValueError): For an unsupported file ending (E124), an
            unsupported "doc"/"text" value (E987) or an unexpected document
            structure (E996).
        RAISES (KeyError): If reading an example hits a missing key (E996).
        """
        i = 0
        for loc in locs:
            loc = util.ensure_path(loc)
            file_name = loc.parts[-1]
            # Reset per file, so a malformed file can never fall through to
            # the examples that were loaded from the previous file.
            examples = None
            if file_name.endswith("json"):
                examples = read_json_file(loc)
            elif file_name.endswith("jsonl"):
                gold_tuples = srsly.read_jsonl(loc)
                try:
                    first_gold_tuple = next(gold_tuples)
                except StopIteration:
                    # Empty file: nothing to yield. A StopIteration must not
                    # escape from inside a generator.
                    continue
                # Put the peeked record back in front of the stream.
                gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
                # TODO: proper format checks with schemas
                if isinstance(first_gold_tuple, dict):
                    if first_gold_tuple.get("paragraphs", None):
                        examples = read_json_object(gold_tuples)
                    elif first_gold_tuple.get("doc_annotation", None):
                        examples = []
                        for ex_dict in gold_tuples:
                            doc = ex_dict.get("doc", None)
                            if doc is None:
                                doc = ex_dict.get("text", None)
                            if not (doc is None or isinstance(doc, (Doc, str))):
                                raise ValueError(Errors.E987.format(type=type(doc)))
                            examples.append(Example.from_dict(ex_dict, doc=doc))
            elif file_name.endswith("msg"):
                text, ex_dict = srsly.read_msgpack(loc)
                examples = [Example.from_dict(ex_dict, doc=text)]
            else:
                supported = ("json", "jsonl", "msg")
                raise ValueError(Errors.E124.format(path=loc, formats=supported))
            if examples is None:
                msg = "Unexpected document structure"
                raise ValueError(Errors.E996.format(file=file_name, msg=msg))
            try:
                for example in examples:
                    yield example
                    i += 1
                    if limit and i >= limit:
                        return
            except KeyError as e:
                msg = "Missing key {}".format(e)
                raise KeyError(Errors.E996.format(file=file_name, msg=msg)) from e

    @property
    def dev_examples(self):
        """YIELDS (Example): The development examples stored in the temp dir."""
        locs = (self.tmp_dir / "dev").iterdir()
        yield from self.read_examples(locs, limit=self.limit)

    @property
    def train_examples(self):
        """YIELDS (Example): The training examples stored in the temp dir."""
        locs = (self.tmp_dir / "train").iterdir()
        yield from self.read_examples(locs, limit=self.limit)

    def count_train(self):
        """Returns count of words in train examples"""
        # `train_examples` already stops after `self.limit` examples, so no
        # separate limit check is needed here.
        return sum(len(example.token_annotation.words)
                   for example in self.train_examples)

    def train_dataset(self, nlp, gold_preproc=False, max_length=None,
                      noise_level=0.0, orth_variant_level=0.0,
                      ignore_misaligned=False):
        """Yield the training examples in shuffled file order, with gold
        parses attached and projectivization enabled.

        nlp: Object providing `make_doc`, `vocab` and `Defaults`.
        gold_preproc (bool): Create one doc per sentence.
        max_length (int): Only yield examples whose doc is shorter than this.
            A falsy value disables the check.
        noise_level (float): Passed on to `add_noise`.
        orth_variant_level (float): Passed on to `make_orth_variants`.
        ignore_misaligned (bool): Passed on to `get_gold_parses`.
        YIELDS (Example): The prepared examples.
        """
        locs = list((self.tmp_dir / 'train').iterdir())
        random.shuffle(locs)
        train_examples = self.read_examples(locs, limit=self.limit)
        gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc,
                                            max_length=max_length,
                                            noise_level=noise_level,
                                            orth_variant_level=orth_variant_level,
                                            make_projective=True,
                                            ignore_misaligned=ignore_misaligned)
        yield from gold_examples

    def train_dataset_without_preprocessing(self, nlp, gold_preproc=False,
                                            ignore_misaligned=False):
        """Yield the training examples with gold parses attached, using the
        default (zero) noise and variant levels and no projectivization.
        """
        examples = self.iter_gold_docs(nlp, self.train_examples,
                                       gold_preproc=gold_preproc,
                                       ignore_misaligned=ignore_misaligned)
        yield from examples

    def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
        """Yield the development examples with gold parses attached."""
        examples = self.iter_gold_docs(nlp, self.dev_examples,
                                       gold_preproc=gold_preproc,
                                       ignore_misaligned=ignore_misaligned)
        yield from examples

    @classmethod
    def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
                       noise_level=0.0, orth_variant_level=0.0,
                       make_projective=False, ignore_misaligned=False):
        """Create docs and gold parses for the examples and yield the results.

        Setting gold_preproc will result in creating a doc per sentence.
        Examples without a gold parse are dropped, as are examples whose doc
        is not shorter than `max_length` (when `max_length` is truthy).
        """
        for example in examples:
            if gold_preproc:
                split_examples = example.split_sents()
                example_golds = []
                for split_example in split_examples:
                    split_example_docs = cls._make_docs(nlp, split_example,
                            gold_preproc, noise_level=noise_level,
                            orth_variant_level=orth_variant_level)
                    split_example_golds = cls._make_golds(split_example_docs,
                            vocab=nlp.vocab, make_projective=make_projective,
                            ignore_misaligned=ignore_misaligned)
                    example_golds.extend(split_example_golds)
            else:
                example_docs = cls._make_docs(nlp, example,
                        gold_preproc, noise_level=noise_level,
                        orth_variant_level=orth_variant_level)
                example_golds = cls._make_golds(example_docs, vocab=nlp.vocab,
                        make_projective=make_projective,
                        ignore_misaligned=ignore_misaligned)
            for ex in example_golds:
                if ex.goldparse is None:
                    continue
                if (not max_length) or len(ex.doc) < max_length:
                    yield ex

    @classmethod
    def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
        """Attach a doc to an orthographic variant of the example.

        RETURNS (list): A one-element list holding the variant example.
        """
        var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
        # gold_preproc is not used ?!
        if example.text is not None:
            var_text = add_noise(var_example.text, noise_level)
            var_example.doc = nlp.make_doc(var_text)
        else:
            # No raw text available: build the doc from the annotated words.
            noisy_words = add_noise(var_example.token_annotation.words, noise_level)
            var_example.doc = Doc(nlp.vocab, words=noisy_words)
        return [var_example]

    @classmethod
    def _make_golds(cls, examples, vocab=None, make_projective=False,
                    ignore_misaligned=False):
        """Attach gold parses to the examples.

        RETURNS (list): The examples whose gold parse came with a truthy doc.
        """
        filtered_examples = []
        for example in examples:
            gold_parses = example.get_gold_parses(vocab=vocab,
                    make_projective=make_projective,
                    ignore_misaligned=ignore_misaligned)
            assert len(gold_parses) == 1
            doc, gold = gold_parses[0]
            if doc:
                assert doc == example.doc
                example.goldparse = gold
                filtered_examples.append(example)
        return filtered_examples
|
2019-11-25 18:03:28 +03:00
|
|
|
|
2019-11-11 19:35:27 +03:00
|
|
|
|
|
|
|
def make_orth_variants(nlp, example, orth_variant_level=0.0):
    """Create a copy of an example with orthographic variants substituted.

    The example is returned unchanged when a random draw is at or above
    `orth_variant_level`, when it has no token annotation, or when the raw
    text can't be matched up with the modified words. Otherwise a second
    random draw decides whether the text and words are lowercased, and
    words are swapped for variants from `nlp.Defaults.single_orth_variants`
    and `nlp.Defaults.paired_orth_variants` where the word's tag and text
    both match an entry.

    nlp: Object whose `Defaults` provides the two variant tables. Each entry
        is indexed with "tags" and "variants".
    example (Example): The example to create a variant of. Not modified.
    orth_variant_level (float): Threshold for the first random draw.
    RETURNS (Example): The variant example, or the original example.
    """
    if random.random() >= orth_variant_level:
        return example
    if not example.token_annotation:
        return example
    raw = example.text
    # Must be initialised here: it is read further down on both outcomes of
    # the draw, not only on the lowercasing one.
    lower = False
    if random.random() >= 0.5:
        lower = True
        if raw is not None:
            raw = raw.lower()
    ndsv = nlp.Defaults.single_orth_variants
    ndpv = nlp.Defaults.paired_orth_variants
    # modify words in paragraph_tuples
    variant_example = Example(doc=raw)
    token_annotation = example.token_annotation
    words = token_annotation.words
    tags = token_annotation.tags
    if not words or not tags:
        # add the unmodified annotation
        token_dict = token_annotation.to_dict()
        variant_example.set_token_annotation(**token_dict)
    else:
        # Always work on a new list, so the input example keeps its words
        # (the abort path below returns it as "unmodified").
        if lower:
            words = [w.lower() for w in words]
        else:
            words = list(words)
        # single variants
        punct_choices = [random.choice(x["variants"]) for x in ndsv]
        for word_idx in range(len(words)):
            for entry, choice in zip(ndsv, punct_choices):
                if tags[word_idx] in entry["tags"] \
                        and words[word_idx] in entry["variants"]:
                    words[word_idx] = choice
        # paired variants
        punct_choices = [random.choice(x["variants"]) for x in ndpv]
        for word_idx in range(len(words)):
            for entry, choice in zip(ndpv, punct_choices):
                if tags[word_idx] in entry["tags"] \
                        and words[word_idx] in itertools.chain.from_iterable(entry["variants"]):
                    # backup option: random left vs. right from pair
                    pair_idx = random.choice([0, 1])
                    # best option: rely on paired POS tags like `` / ''
                    if len(entry["tags"]) == 2:
                        pair_idx = entry["tags"].index(tags[word_idx])
                    # next best option: rely on position in variants
                    # (may not be unambiguous, so order of variants matters)
                    else:
                        for pair in entry["variants"]:
                            if words[word_idx] in pair:
                                pair_idx = pair.index(words[word_idx])
                    words[word_idx] = choice[pair_idx]

        token_dict = token_annotation.to_dict()
        token_dict["words"] = words
        token_dict["tags"] = tags
        variant_example.set_token_annotation(**token_dict)
    # modify raw to match variant_paragraph_tuples
    if raw is not None:
        variants = []
        for single_variants in ndsv:
            variants.extend(single_variants["variants"])
        for paired_variants in ndpv:
            variants.extend(itertools.chain.from_iterable(paired_variants["variants"]))
        # store variants in reverse length order to be able to prioritize
        # longer matches (e.g., "---" before "--")
        variants = sorted(variants, key=len, reverse=True)
        variant_raw = ""
        raw_idx = 0
        # add initial whitespace
        while raw_idx < len(raw) and re.match(r"\s", raw[raw_idx]):
            variant_raw += raw[raw_idx]
            raw_idx += 1
        for word in variant_example.token_annotation.words:
            match_found = False
            # add identical word
            if word not in variants and raw.startswith(word, raw_idx):
                variant_raw += word
                raw_idx += len(word)
                match_found = True
            # add variant word
            else:
                for variant in variants:
                    if raw.startswith(variant, raw_idx):
                        raw_idx += len(variant)
                        variant_raw += word
                        match_found = True
                        break
            # something went wrong, abort
            # (add a warning message?)
            if not match_found:
                return example
            # add following whitespace
            while raw_idx < len(raw) and re.match(r"\s", raw[raw_idx]):
                variant_raw += raw[raw_idx]
                raw_idx += 1
        variant_example.doc = variant_raw
    return variant_example
|
2019-08-28 10:14:20 +03:00
|
|
|
|
|
|
|
|
2017-06-05 04:16:57 +03:00
|
|
|
def add_noise(orig, noise_level):
    """Randomly corrupt a text or a list of words.

    orig (str or list): The text, or the list of words.
    noise_level (float): Threshold for the random draws. The input is
        returned unchanged when the first draw is at or above it; otherwise
        each word (for a list) or character (for anything else) is passed
        through `_corrupt` with the same level.
    RETURNS (str or list): The input itself, a new list without the words
        that became empty, or a new string.
    """
    if random.random() >= noise_level:
        return orig
    # isinstance rather than an exact type check, so list subclasses are
    # corrupted word by word as well.
    if isinstance(orig, list):
        corrupted = (_corrupt(word, noise_level) for word in orig)
        return [w for w in corrupted if w]
    return "".join(_corrupt(c, noise_level) for c in orig)
|
2017-06-05 04:16:57 +03:00
|
|
|
|
|
|
|
|
2019-08-29 16:39:32 +03:00
|
|
|
def _corrupt(c, noise_level):
|
2017-06-05 04:16:57 +03:00
|
|
|
if random.random() >= noise_level:
|
|
|
|
return c
|
2019-03-08 13:42:26 +03:00
|
|
|
elif c in [".", "'", "!", "?", ","]:
|
2019-08-29 16:39:32 +03:00
|
|
|
return "\n"
|
2017-06-05 04:16:57 +03:00
|
|
|
else:
|
|
|
|
return c.lower()
|
|
|
|
|
|
|
|
|
2018-11-30 22:16:14 +03:00
|
|
|
def read_json_object(json_corpus_section):
    """Convert already loaded JSON-formatted documents (e.g. the content of a
    training data file) into training examples.

    json_corpus_section (list): The data.
    YIELDS (Example): The reformatted data - one training example per paragraph
    """
    for json_doc in json_corpus_section:
        yield from json_to_examples(json_doc)
|
2018-11-30 22:16:14 +03:00
|
|
|
|
|
|
|
|
2019-11-11 19:35:27 +03:00
|
|
|
def json_to_examples(doc):
    """Convert an item in the JSON-formatted training data to the format
    used by GoldParse.

    Token-level values of all sentences in a paragraph are concatenated.
    Ids, heads and bracket positions are shifted by the sentence's offset
    within the paragraph.

    doc (dict): One entry in the training data. Must have "paragraphs"; each
        paragraph must have "sentences", each sentence "tokens" and each
        token "orth". The other token keys fall back to defaults.
    YIELDS (Example): The reformatted data - one training example per paragraph
    """
    for paragraph in doc["paragraphs"]:
        example = Example(doc=paragraph.get("raw", None))
        words = []
        ids = []
        tags = []
        pos = []
        morphs = []
        lemmas = []
        heads = []
        labels = []
        ner = []
        sent_starts = []
        brackets = []
        for sent in paragraph["sentences"]:
            # Offset of this sentence's first token within the paragraph.
            sent_start_i = len(words)
            for i, token in enumerate(sent["tokens"]):
                words.append(token["orth"])
                ids.append(token.get('id', sent_start_i + i))
                tags.append(token.get('tag', "-"))
                pos.append(token.get("pos", ""))
                morphs.append(token.get("morph", ""))
                lemmas.append(token.get("lemma", ""))
                # "head" is an offset from the token's own position.
                heads.append(token.get("head", 0) + sent_start_i + i)
                dep = token.get("dep", "")
                # Ensure ROOT label is case-insensitive
                labels.append("ROOT" if dep.lower() == "root" else dep)
                ner.append(token.get("ner", "-"))
                sent_starts.append(1 if i == 0 else 0)
            if "brackets" in sent:
                brackets.extend((b["first"] + sent_start_i,
                                 b["last"] + sent_start_i, b["label"])
                                for b in sent["brackets"])
        # "cats" is iterated as a sequence of {"label": ..., "value": ...}.
        cats = {cat["label"]: cat["value"] for cat in paragraph.get("cats", [])}
        example.set_token_annotation(ids=ids, words=words, tags=tags,
                pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
                deps=labels, entities=ner, sent_starts=sent_starts,
                brackets=brackets)
        example.set_doc_annotation(cats=cats)
        yield example
|
2018-11-30 22:16:14 +03:00
|
|
|
|
|
|
|
|
2017-05-22 12:48:02 +03:00
|
|
|
def read_json_file(loc, docs_filter=None, limit=None):
    """Read training examples from a JSON file, or from every entry of a
    directory.

    loc (str or Path): The file or directory to read.
    docs_filter (callable): Optional predicate. Documents for which it
        returns a falsy value are skipped.
    limit (int): Passed along when recursing into a directory; not otherwise
        used here.
    YIELDS (Example): One training example per paragraph.
    """
    loc = util.ensure_path(loc)
    if loc.is_dir():
        # iterdir() yields paths that already include `loc`, so they must not
        # be joined onto it again. The filter applies to nested files too.
        for child in loc.iterdir():
            yield from read_json_file(child, docs_filter=docs_filter, limit=limit)
    else:
        for doc in _json_iterate(loc):
            if docs_filter is not None and not docs_filter(doc):
                continue
            yield from json_to_examples(doc)
|
2015-05-06 17:27:31 +03:00
|
|
|
|
|
|
|
|
2018-03-27 20:23:02 +03:00
|
|
|
def _json_iterate(loc):
    """Yield the objects of a JSON file's outermost array, parsed one by one.

    loc (str or Path): The file to read.
    YIELDS: The result of `srsly.json_loads` for each `{...}` object that sits
        directly inside the outermost `[...]`.
    """
    # We should've made these files jsonl...But since we didn't, parse out
    # the docs one-by-one to reduce memory usage.
    # It's okay to read in the whole file -- just don't parse it into JSON.
    cdef bytes py_raw
    loc = util.ensure_path(loc)
    with loc.open("rb") as handle:
        py_raw = handle.read()
    cdef long file_length = len(py_raw)
    if file_length > 2 ** 30:
        warnings.warn(Warnings.W027.format(size=file_length))

    raw = <char*>py_raw
    cdef int square_depth = 0
    cdef int curly_depth = 0
    cdef int inside_string = 0
    cdef int escape = 0
    # Byte offset where the current top-level object opened; -1 when none.
    cdef long start = -1
    cdef char c
    cdef char quote = ord('"')
    cdef char backslash = ord("\\")
    cdef char open_square = ord("[")
    cdef char close_square = ord("]")
    cdef char open_curly = ord("{")
    cdef char close_curly = ord("}")
    for i in range(file_length):
        c = raw[i]
        # The byte following a backslash is never interpreted.
        if escape:
            escape = False
            continue
        if c == backslash:
            escape = True
            continue
        if c == quote:
            inside_string = not inside_string
            continue
        # Brackets inside string values don't change the nesting depth.
        if inside_string:
            continue
        if c == open_square:
            square_depth += 1
        elif c == close_square:
            square_depth -= 1
        elif c == open_curly:
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif c == close_curly:
            curly_depth -= 1
            if square_depth != 1 or curly_depth != 0:
                continue
            # A top-level object just closed: decode and parse only its slice.
            doc_str = py_raw[start : i + 1].decode("utf8")
            try:
                yield srsly.json_loads(doc_str)
            except Exception:
                # Show the offending text before re-raising.
                print(doc_str)
                raise
            start = -1
|
|
|
|
|
|
|
|
|
2017-05-26 19:32:55 +03:00
|
|
|
def iob_to_biluo(tags):
    """Convert a sequence of IOB tags into BILUO tags.

    The input is copied into a list which is then consumed from the front:
    each pass first takes the leading run of "O" tags and then one entity
    span, until nothing is left.

    tags (iterable): The IOB tag strings.
    RETURNS (list): The converted tag strings.
    """
    remaining = list(tags)
    biluo = []
    while remaining:
        # Both helpers pop what they handle off the front of `remaining`.
        biluo += _consume_os(remaining)
        biluo += _consume_ent(remaining)
    return biluo
|
|
|
|
|
|
|
|
|
2020-05-18 23:23:33 +03:00
|
|
|
def biluo_to_iob(tags):
    """Convert a sequence of BILUO tags into IOB tags.

    "U-" prefixes become "B-" and "L-" prefixes become "I-"; every other tag
    is passed through unchanged. Only the two-character prefix is rewritten,
    so labels that themselves contain "U-" or "L-" (e.g. "B-SCHOOL-NAME")
    are left intact.

    tags (iterable): The BILUO tag strings.
    RETURNS (list): The converted tag strings.
    """
    out = []
    for tag in tags:
        if tag.startswith("U-"):
            tag = "B-" + tag[2:]
        elif tag.startswith("L-"):
            tag = "I-" + tag[2:]
        out.append(tag)
    return out
|
|
|
|
|
|
|
|
|
2015-04-10 05:59:11 +03:00
|
|
|
def _consume_os(tags):
|
2019-03-08 13:42:26 +03:00
|
|
|
while tags and tags[0] == "O":
|
2015-04-10 05:59:11 +03:00
|
|
|
yield tags.pop(0)
|
|
|
|
|
|
|
|
|
|
|
|
def _consume_ent(tags):
|
|
|
|
if not tags:
|
|
|
|
return []
|
2018-05-30 13:28:44 +03:00
|
|
|
tag = tags.pop(0)
|
2019-03-08 13:42:26 +03:00
|
|
|
target_in = "I" + tag[1:]
|
|
|
|
target_last = "L" + tag[1:]
|
2015-04-10 05:59:11 +03:00
|
|
|
length = 1
|
2018-05-30 13:28:44 +03:00
|
|
|
while tags and tags[0] in {target_in, target_last}:
|
2015-04-10 05:59:11 +03:00
|
|
|
length += 1
|
|
|
|
tags.pop(0)
|
2018-05-30 13:28:44 +03:00
|
|
|
label = tag[2:]
|
2015-04-10 05:59:11 +03:00
|
|
|
if length == 1:
|
2019-10-21 13:20:28 +03:00
|
|
|
if len(label) == 0:
|
|
|
|
raise ValueError(Errors.E177.format(tag=tag))
|
2019-03-08 13:42:26 +03:00
|
|
|
return ["U-" + label]
|
2015-04-10 05:59:11 +03:00
|
|
|
else:
|
2019-03-08 13:42:26 +03:00
|
|
|
start = "B-" + label
|
|
|
|
end = "L-" + label
|
2019-12-25 19:59:52 +03:00
|
|
|
middle = [f"I-{label}" for _ in range(1, length - 1)]
|
2015-04-10 05:59:11 +03:00
|
|
|
return [start] + middle + [end]
|
|
|
|
|
|
|
|
|
2019-11-11 19:35:27 +03:00
|
|
|
cdef class TokenAnnotation:
    """Holder for the token-level annotation lists of one text.

    Any field that is not passed, or is passed as a falsy value, is stored as
    a fresh empty list. The get_* accessors return a per-field fallback when
    the requested index is at or beyond the end of the stored list.
    """
    def __init__(self, ids=None, words=None, tags=None, pos=None, morphs=None,
                 lemmas=None, heads=None, deps=None, entities=None, sent_starts=None,
                 brackets=None):
        self.ids = ids or []
        self.words = words or []
        self.tags = tags or []
        self.pos = pos or []
        self.morphs = morphs or []
        self.lemmas = lemmas or []
        self.heads = heads or []
        self.deps = deps or []
        self.entities = entities or []
        self.sent_starts = sent_starts or []
        self.brackets = brackets or []

    @classmethod
    def from_dict(cls, token_dict):
        """Build a TokenAnnotation from a dict; missing keys count as None."""
        fields = ("ids", "words", "tags", "pos", "morphs", "lemmas", "heads",
                  "deps", "entities", "sent_starts", "brackets")
        return cls(**{name: token_dict.get(name) for name in fields})

    def to_dict(self):
        """Return all annotation lists keyed by field name."""
        return {
            "ids": self.ids,
            "words": self.words,
            "tags": self.tags,
            "pos": self.pos,
            "morphs": self.morphs,
            "lemmas": self.lemmas,
            "heads": self.heads,
            "deps": self.deps,
            "entities": self.entities,
            "sent_starts": self.sent_starts,
            "brackets": self.brackets,
        }

    def get_id(self, i):
        """Return the id at position i, or i itself if there is none."""
        if i < len(self.ids):
            return self.ids[i]
        return i

    def get_word(self, i):
        """Return the word at position i, or "" if there is none."""
        if i < len(self.words):
            return self.words[i]
        return ""

    def get_tag(self, i):
        """Return the tag at position i, or "-" if there is none."""
        if i < len(self.tags):
            return self.tags[i]
        return "-"

    def get_pos(self, i):
        """Return the pos value at position i, or "" if there is none."""
        if i < len(self.pos):
            return self.pos[i]
        return ""

    def get_morph(self, i):
        """Return the morph value at position i, or "" if there is none."""
        if i < len(self.morphs):
            return self.morphs[i]
        return ""

    def get_lemma(self, i):
        """Return the lemma at position i, or "" if there is none."""
        if i < len(self.lemmas):
            return self.lemmas[i]
        return ""

    def get_head(self, i):
        """Return the head at position i, or i itself if there is none."""
        if i < len(self.heads):
            return self.heads[i]
        return i

    def get_dep(self, i):
        """Return the dependency label at position i, or "" if there is none."""
        if i < len(self.deps):
            return self.deps[i]
        return ""

    def get_entity(self, i):
        """Return the entity tag at position i, or "-" if there is none."""
        if i < len(self.entities):
            return self.entities[i]
        return "-"

    def get_sent_start(self, i):
        """Return the sentence-start value at position i, or None if there is none."""
        if i < len(self.sent_starts):
            return self.sent_starts[i]
        return None

    def __str__(self):
        return str(self.to_dict())

    def __repr__(self):
        return str(self)
|
|
|
|
|
2019-11-11 19:35:27 +03:00
|
|
|
|
|
|
|
cdef class DocAnnotation:
    """Holder for the document-level annotations `cats` and `links`.

    A value that is not passed, or is passed as a falsy value, is stored as a
    fresh empty dict.
    """
    def __init__(self, cats=None, links=None):
        self.cats = cats or {}
        self.links = links or {}

    @classmethod
    def from_dict(cls, doc_dict):
        """Build a DocAnnotation from a dict; missing keys count as None."""
        cats = doc_dict.get("cats")
        links = doc_dict.get("links")
        return cls(cats=cats, links=links)

    def to_dict(self):
        """Return the annotations as a dict with "cats" and "links" keys."""
        return {
            "cats": self.cats,
            "links": self.links,
        }

    def __str__(self):
        return str(self.to_dict())

    def __repr__(self):
        return str(self)
|
|
|
|
|
2019-11-11 19:35:27 +03:00
|
|
|
|
|
|
|
cdef class Example:
    """A training example: a doc (raw text or Doc) plus its doc-level and
    token-level annotations, and optionally a GoldParse built from them.
    """
    def __init__(self, doc_annotation=None, token_annotation=None, doc=None,
                 goldparse=None):
        """ Doc can either be text, or an actual Doc """
        self.doc = doc
        self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
        self.token_annotation = token_annotation if token_annotation else TokenAnnotation()
        self.goldparse = goldparse

    @classmethod
    def from_gold(cls, goldparse, doc=None):
        """Create an Example from the annotations stored on a GoldParse.

        Note that the GoldParse itself is not stored on the new Example.
        """
        doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
        token_annotation = goldparse.get_token_annotation()
        return cls(doc_annotation, token_annotation, doc)

    @classmethod
    def from_dict(cls, example_dict, doc=None):
        """Create an Example from a dict with optional "token_annotation" and
        "doc_annotation" entries (the inverse of to_dict)."""
        token_dict = example_dict.get("token_annotation", {})
        token_annotation = TokenAnnotation.from_dict(token_dict)
        doc_dict = example_dict.get("doc_annotation", {})
        doc_annotation = DocAnnotation.from_dict(doc_dict)
        return cls(doc_annotation, token_annotation, doc)

    def to_dict(self):
        """ Note that this method does NOT export the doc, only the annotations ! """
        token_dict = self.token_annotation.to_dict()
        doc_dict = self.doc_annotation.to_dict()
        return {"token_annotation": token_dict, "doc_annotation": doc_dict}

    @property
    def text(self):
        """The text of the example: None without a doc, `doc.text` for a Doc,
        otherwise the stored doc value itself."""
        if self.doc is None:
            return None
        if isinstance(self.doc, Doc):
            return self.doc.text
        return self.doc

    @property
    def gold(self):
        """The GoldParse of this example, created on first access and cached.

        Creation goes through get_gold_parses() with its default arguments.
        """
        if self.goldparse is None:
            _, gold = self.get_gold_parses()[0]
            self.goldparse = gold
        return self.goldparse

    def set_token_annotation(self, ids=None, words=None, tags=None, pos=None,
                             morphs=None, lemmas=None, heads=None, deps=None,
                             entities=None, sent_starts=None, brackets=None):
        """Replace the token annotation with a new TokenAnnotation built from
        the given fields."""
        self.token_annotation = TokenAnnotation(ids=ids, words=words, tags=tags,
                            pos=pos, morphs=morphs, lemmas=lemmas, heads=heads,
                            deps=deps, entities=entities,
                            sent_starts=sent_starts, brackets=brackets)

    def set_doc_annotation(self, cats=None, links=None):
        """Update cats and/or links; falsy arguments leave the current value
        untouched."""
        if cats:
            self.doc_annotation.cats = cats
        if links:
            self.doc_annotation.links = links

    def split_sents(self):
        """ Split the token annotations into multiple Examples based on
        sent_starts and return a list of the new Examples"""
        if not self.token_annotation.words:
            return [self]
        s_example = Example(doc=None, doc_annotation=self.doc_annotation)
        s_ids, s_words, s_tags, s_pos, s_morphs = [], [], [], [], []
        s_lemmas, s_heads, s_deps, s_ents, s_sent_starts = [], [], [], [], []
        s_brackets = []
        sent_start_i = 0
        t = self.token_annotation
        split_examples = []
        for i in range(len(t.words)):
            # Use the accessor rather than indexing sent_starts directly: it
            # returns None (no split) when sent_starts is shorter than words
            # instead of raising an IndexError.
            if i > 0 and t.get_sent_start(i) == 1:
                # A new sentence starts here: flush what was collected so far
                # into the current Example and start a new one.
                s_example.set_token_annotation(ids=s_ids,
                    words=s_words, tags=s_tags, pos=s_pos, morphs=s_morphs,
                    lemmas=s_lemmas, heads=s_heads, deps=s_deps,
                    entities=s_ents, sent_starts=s_sent_starts,
                    brackets=s_brackets)
                split_examples.append(s_example)
                s_example = Example(doc=None, doc_annotation=self.doc_annotation)
                s_ids, s_words, s_tags, s_pos, s_heads = [], [], [], [], []
                s_deps, s_ents, s_morphs, s_lemmas = [], [], [], []
                s_sent_starts, s_brackets = [], []
                sent_start_i = i
            s_ids.append(t.get_id(i))
            s_words.append(t.get_word(i))
            s_tags.append(t.get_tag(i))
            s_pos.append(t.get_pos(i))
            s_morphs.append(t.get_morph(i))
            s_lemmas.append(t.get_lemma(i))
            # Heads and brackets are re-based on the start of the sentence.
            s_heads.append(t.get_head(i) - sent_start_i)
            s_deps.append(t.get_dep(i))
            s_ents.append(t.get_entity(i))
            s_sent_starts.append(t.get_sent_start(i))
            s_brackets.extend((b[0] - sent_start_i,
                               b[1] - sent_start_i, b[2])
                               for b in t.brackets if b[0] == i)
        # Flush the last (or only) sentence.
        s_example.set_token_annotation(ids=s_ids, words=s_words, tags=s_tags,
                pos=s_pos, morphs=s_morphs, lemmas=s_lemmas, heads=s_heads,
                deps=s_deps, entities=s_ents, sent_starts=s_sent_starts,
                brackets=s_brackets)
        split_examples.append(s_example)
        return split_examples

    def get_gold_parses(self, merge=True, vocab=None, make_projective=False,
                        ignore_misaligned=False):
        """Return a list of (doc, GoldParse) objects.
        If merge is set to True, keep all Token annotations as one big list.

        merge (bool): Create one GoldParse for the whole example (True) or
            one per sentence as returned by split_sents() (False).
        vocab: Used to construct a Doc from the annotated words. Required
            when merge is False, or when the stored doc is not a Doc.
        make_projective (bool): Passed on to GoldParse.from_annotation.
        ignore_misaligned (bool): Whether to suppress an AlignmentError. With
            merge=True the affected entry is (doc, None); with merge=False
            the affected sentence is left out of the result.
        RETURNS (list): The (doc, GoldParse) tuples.
        RAISES (ValueError): If a vocab is needed but not provided.
        """
        d = self.doc_annotation
        # merge == do not modify Example
        if merge:
            t = self.token_annotation
            doc = self.doc
            if doc is None or not isinstance(doc, Doc):
                if not vocab:
                    raise ValueError(Errors.E998)
                doc = Doc(vocab, words=t.words)
            try:
                gp = GoldParse.from_annotation(doc, d, t,
                                               make_projective=make_projective)
            except AlignmentError:
                if ignore_misaligned:
                    gp = None
                else:
                    raise
            return [(doc, gp)]
        # not merging: one GoldParse per sentence, defining docs with the words
        # from each sentence
        else:
            parses = []
            split_examples = self.split_sents()
            for split_example in split_examples:
                if not vocab:
                    raise ValueError(Errors.E998)
                split_doc = Doc(vocab, words=split_example.token_annotation.words)
                try:
                    gp = GoldParse.from_annotation(split_doc, d,
                                                   split_example.token_annotation,
                                                   make_projective=make_projective)
                except AlignmentError:
                    if ignore_misaligned:
                        gp = None
                    else:
                        raise
                if gp is not None:
                    parses.append((split_doc, gp))
            return parses

    @classmethod
    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
        """
        Return a list of Example objects, from a variety of input formats.
        make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
        """
        if isinstance(examples, Example):
            return [examples]
        if isinstance(examples, tuple):
            examples = [examples]
        converted_examples = []
        for ex in examples:
            # convert string to Doc to Example
            if isinstance(ex, str):
                if keep_raw_text:
                    converted_examples.append(Example(doc=ex))
                else:
                    doc = make_doc(ex)
                    converted_examples.append(Example(doc=doc))
            # convert Doc to Example
            elif isinstance(ex, Doc):
                converted_examples.append(Example(doc=ex))
            # convert tuples to Example
            elif isinstance(ex, tuple) and len(ex) == 2:
                doc, gold = ex
                gold_dict = {}
                # convert string to Doc
                if isinstance(doc, str) and not keep_raw_text:
                    doc = make_doc(doc)
                # convert dict to GoldParse
                if isinstance(gold, dict):
                    gold_dict = gold
                    if doc is not None or gold.get("words", None) is not None:
                        gold = GoldParse(doc, **gold)
                    else:
                        gold = None
                if gold is not None:
                    converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
                else:
                    raise ValueError(Errors.E999.format(gold_dict=gold_dict))
            else:
                # anything else is passed through unchanged
                converted_examples.append(ex)
        return converted_examples
|
|
|
|
|
|
|
|
|
2015-03-09 08:46:22 +03:00
|
|
|
cdef class GoldParse:
    """Collection for training annotations.

    DOCS: https://spacy.io/api/goldparse
    """
    @classmethod
    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
        """Create a GoldParse for `doc` from a DocAnnotation (cats, links) and
        a TokenAnnotation (all token-level fields)."""
        return cls(doc, words=token_annotation.words,
                   tags=token_annotation.tags,
                   pos=token_annotation.pos,
                   morphs=token_annotation.morphs,
                   lemmas=token_annotation.lemmas,
                   heads=token_annotation.heads,
                   deps=token_annotation.deps,
                   entities=token_annotation.entities,
                   sent_starts=token_annotation.sent_starts,
                   cats=doc_annotation.cats,
                   links=doc_annotation.links,
                   make_projective=make_projective)

    def get_token_annotation(self):
        """Return the aligned token-level annotations as a TokenAnnotation.

        The ids are 0..len(words)-1, or None if there are no words.
        """
        ids = None
        if self.words:
            ids = list(range(len(self.words)))

        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
                               pos=self.pos, morphs=self.morphs,
                               lemmas=self.lemmas, heads=self.heads,
                               deps=self.labels, entities=self.ner,
                               sent_starts=self.sent_starts)

    def __init__(self, doc, words=None, tags=None, pos=None, morphs=None,
                 lemmas=None, heads=None, deps=None, entities=None,
                 sent_starts=None, make_projective=False, cats=None,
                 links=None):
        """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

        doc (Doc): The document the annotations refer to.
        words (iterable): A sequence of unicode word strings.
        tags (iterable): A sequence of strings, representing tag annotations.
        pos (iterable): A sequence of strings, representing UPOS annotations.
        morphs (iterable): A sequence of strings, representing morph
            annotations.
        lemmas (iterable): A sequence of strings, representing lemma
            annotations.
        heads (iterable): A sequence of integers, representing syntactic
            head offsets.
        deps (iterable): A sequence of strings, representing the syntactic
            relation types.
        entities (iterable): A sequence of named entity annotations, either as
            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
            representing the entity positions.
        sent_starts (iterable): A sequence of sentence position tags, 1 for
            the first word in a sentence, 0 for all others.
        cats (dict): Labels for text classification. Each key in the dictionary
            may be a string or an int, or a `(start_char, end_char, label)`
            tuple, indicating that the label is applied to only part of the
            document (usually a sentence). Unlike entity annotations, label
            annotations can overlap, i.e. a single word can be covered by
            multiple labelled spans. The TextCategorizer component expects
            true examples of a label to have the value 1.0, and negative
            examples of a label to have the value 0.0. Labels not in the
            dictionary are treated as missing - the gradient for those labels
            will be zero.
        links (dict): A dict with `(start_char, end_char)` keys,
            and the values being dicts with kb_id:value entries,
            representing the external IDs in a knowledge base (KB)
            mapped to either 1.0 or 0.0, indicating positive and
            negative examples respectively.
        RETURNS (GoldParse): The newly constructed object.
        """
        self.mem = Pool()
        self.loss = 0
        self.length = len(doc)

        self.cats = {} if cats is None else dict(cats)
        self.links = {} if links is None else dict(links)

        # avoid allocating memory if the doc does not contain any tokens
        if self.length == 0:
            # set a minimal orig so that the scorer can score an empty doc
            self.orig = TokenAnnotation(ids=[])
        else:
            # Fill in defaults for every annotation that was not provided:
            # the doc's own token texts for words, None for everything else.
            if not words:
                words = [token.text for token in doc]
            if not tags:
                tags = [None for _ in words]
            if not pos:
                pos = [None for _ in words]
            if not morphs:
                morphs = [None for _ in words]
            if not lemmas:
                lemmas = [None for _ in words]
            if not heads:
                heads = [None for _ in words]
            if not deps:
                deps = [None for _ in words]
            if not sent_starts:
                sent_starts = [None for _ in words]
            # entities=None means "missing" ("-"), while an empty sequence
            # means "no entities" ("O").
            if entities is None:
                entities = ["-" for _ in words]
            elif len(entities) == 0:
                entities = ["O" for _ in words]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
                entities = [(ent if ent is not None else "-") for ent in entities]
                if not isinstance(entities[0], str):
                    # Assume we have entities specified by character offset.
                    entities = biluo_tags_from_offsets(doc, entities)

            # These are filled by the tagger/parser/entity recogniser
            self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
            self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
            self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))

            # One slot per token of `doc`; filled in by the alignment below.
            self.words = [None] * len(doc)
            self.tags = [None] * len(doc)
            self.pos = [None] * len(doc)
            self.morphs = [None] * len(doc)
            self.lemmas = [None] * len(doc)
            self.heads = [None] * len(doc)
            self.labels = [None] * len(doc)
            self.ner = [None] * len(doc)
            self.sent_starts = [None] * len(doc)

            # This needs to be done before we align the words
            if make_projective and any(heads) and any(deps):
                heads, deps = nonproj.projectivize(heads, deps)

            # Do many-to-one alignment for misaligned tokens.
            # If we over-segment, we'll have one gold word that covers a sequence
            # of predicted words
            # If we under-segment, we'll have one predicted word that covers a
            # sequence of gold words.
            # If we "mis-segment", we'll have a sequence of predicted words covering
            # a sequence of gold words. That's many-to-many -- we don't do that.
            cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)

            # Negative alignment entries are stored as None (no counterpart).
            self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
            self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

            # Keep the annotations as they were passed in (after defaults).
            self.orig = TokenAnnotation(ids=list(range(len(words))),
                    words=words, tags=tags, pos=pos, morphs=morphs,
                    lemmas=lemmas, heads=heads, deps=deps, entities=entities,
                    sent_starts=sent_starts, brackets=[])

            for i, gold_i in enumerate(self.cand_to_gold):
                if doc[i].text.isspace():
                    self.words[i] = doc[i].text
                    self.tags[i] = "_SP"
                    self.pos[i] = "SPACE"
                    self.morphs[i] = None
                    self.lemmas[i] = None
                    self.heads[i] = None
                    self.labels[i] = None
                    self.ner[i] = None
                    self.sent_starts[i] = 0
                # NOTE: this is deliberately not an `elif`; the values set for
                # a whitespace token above are overwritten below whenever the
                # token has an alignment.
                if gold_i is None:
                    if i in i2j_multi:
                        self.words[i] = words[i2j_multi[i]]
                        self.tags[i] = tags[i2j_multi[i]]
                        self.pos[i] = pos[i2j_multi[i]]
                        self.morphs[i] = morphs[i2j_multi[i]]
                        self.lemmas[i] = lemmas[i2j_multi[i]]
                        self.sent_starts[i] = sent_starts[i2j_multi[i]]
                        is_last = i2j_multi[i] != i2j_multi.get(i+1)
                        is_first = i2j_multi[i] != i2j_multi.get(i-1)
                        # Set next word in multi-token span as head, until last
                        if not is_last:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
                            head_i = heads[i2j_multi[i]]
                            # Compare against None explicitly: a head index
                            # of 0 is a valid head and must not be skipped.
                            if head_i is not None:
                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        # Now set NER...This is annoying because if we've split
                        # got an entity word split into two, we need to adjust the
                        # BILUO tags. We can't have BB or LL etc.
                        # Case 1: O -- easy.
                        ner_tag = entities[i2j_multi[i]]
                        if ner_tag == "O":
                            self.ner[i] = "O"
                        # Case 2: U. This has to become a B I* L sequence.
                        elif ner_tag.startswith("U-"):
                            if is_first:
                                self.ner[i] = ner_tag.replace("U-", "B-", 1)
                            elif is_last:
                                self.ner[i] = ner_tag.replace("U-", "L-", 1)
                            else:
                                self.ner[i] = ner_tag.replace("U-", "I-", 1)
                        # Case 3: L. If not last, change to I.
                        elif ner_tag.startswith("L-"):
                            if is_last:
                                self.ner[i] = ner_tag
                            else:
                                self.ner[i] = ner_tag.replace("L-", "I-", 1)
                        # Case 4: I. Stays correct
                        elif ner_tag.startswith("I-"):
                            self.ner[i] = ner_tag
                        # Any other tag (e.g. "B-..." or the missing value
                        # "-") leaves self.ner[i] as None.
                else:
                    # One-to-one alignment: copy the gold values across.
                    self.words[i] = words[gold_i]
                    self.tags[i] = tags[gold_i]
                    self.pos[i] = pos[gold_i]
                    self.morphs[i] = morphs[gold_i]
                    self.lemmas[i] = lemmas[gold_i]
                    self.sent_starts[i] = sent_starts[gold_i]
                    if heads[gold_i] is None:
                        self.heads[i] = None
                    else:
                        self.heads[i] = self.gold_to_cand[heads[gold_i]]
                    self.labels[i] = deps[gold_i]
                    self.ner[i] = entities[gold_i]

            # Prevent whitespace that isn't within entities from being tagged as
            # an entity.
            for i in range(len(self.ner)):
                if self.tags[i] == "_SP":
                    prev_ner = self.ner[i-1] if i >= 1 else None
                    next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
                    if prev_ner == "O" or next_ner == "O":
                        self.ner[i] = "O"

            cycle = nonproj.contains_cycle(self.heads)
            if cycle is not None:
                raise ValueError(Errors.E069.format(cycle=cycle,
                    cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]),
                    doc_tokens=" ".join(words[:50])))

    def __len__(self):
        """Get the number of gold-standard tokens.

        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length

    @property
    def is_projective(self):
        """Whether the provided syntactic annotations form a projective
        dependency tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)
|
2015-05-30 02:25:46 +03:00
|
|
|
|
2015-02-21 19:06:58 +03:00
|
|
|
|
2019-12-21 15:47:21 +03:00
|
|
|
def docs_to_json(docs, id=0, ner_missing_tag="O"):
    """Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command.

    docs (iterable / Doc): The Doc object(s) to convert.
    id (int): Id for the JSON.
    ner_missing_tag (str): Passed as `missing` to biluo_tags_from_offsets
        when computing the per-token NER tags.
    RETURNS (dict): The data in spaCy's JSON format
        - each input doc will be treated as a paragraph in the output doc
    """
    if isinstance(docs, Doc):
        docs = [docs]
    json_doc = {"id": id, "paragraphs": []}
    for doc in docs:
        json_para = {"raw": doc.text, "sentences": [], "cats": []}
        json_para["cats"].extend(
            {"label": label, "value": value} for label, value in doc.cats.items()
        )
        # NER tags are computed once per doc and looked up by token index.
        offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        biluo_tags = biluo_tags_from_offsets(doc, offsets, missing=ner_missing_tag)
        for sent in doc.sents:
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
                json_token = {"id": token.i, "orth": token.text}
                if doc.is_tagged:
                    json_token["tag"] = token.tag_
                    json_token["pos"] = token.pos_
                    json_token["morph"] = token.morph_
                    json_token["lemma"] = token.lemma_
                if doc.is_parsed:
                    # The head is stored relative to the token's own index.
                    json_token["head"] = token.head.i - token.i
                    json_token["dep"] = token.dep_
                json_token["ner"] = biluo_tags[token.i]
                json_sent["tokens"].append(json_token)
            json_para["sentences"].append(json_sent)
        json_doc["paragraphs"].append(json_para)
    return json_doc
|
2018-08-14 14:13:10 +03:00
|
|
|
|
|
|
|
|
2019-03-08 13:42:26 +03:00
|
|
|
def biluo_tags_from_offsets(doc, entities, missing="O"):
    """Encode labelled spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).

    doc (Doc): The document that the entity offsets refer to. The output tags
        will refer to the token boundaries within the document.
    entities (iterable): A sequence of `(start, end, label)` triples. `start`
        and `end` should be character-offset integers denoting the slice into
        the original string. The iterable is consumed exactly once, so
        one-shot iterators and generators are supported.
    missing (str): The tag given to every token that shares no character
        with any entity span. Defaults to "O".
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be either the `missing` value, "-" or "{action}-{label}",
        where action is one of "B", "I", "L", "U". The string "-" is used
        where the entity offsets don't align with the tokenization in the
        `Doc` object. The training algorithm will view these as missing
        values. "O" denotes a non-entity token. "B" denotes the beginning of
        a multi-token entity, "I" the inside of an entity of three or more
        tokens, and "L" the end of an entity of two or more tokens. "U"
        denotes a single-token entity.

    Raises a ValueError (Errors.E103) if two entity spans cover the same
    character.

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = biluo_tags_from_offsets(doc, entities)
        >>> assert tags == ["O", "O", 'U-LOC', "O"]
    """
    # Character offset -> token index, for token starts and (exclusive) ends.
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx + len(token): token.i for token in doc}
    # Every token starts out as "-"; it is later replaced either by an entity
    # tag or by `missing`. Whatever is left as "-" overlaps an entity whose
    # boundaries do not line up with the tokenization.
    biluo = ["-" for _ in doc]
    # Maps each character offset covered by an entity to that entity's
    # (start, end, label) triple. It serves both the overlap check and the
    # final "does this token touch any entity" test, so `entities` only has
    # to be iterated once.
    chars_in_ents = {}
    for start_char, end_char, label in entities:
        # Ensure no overlapping entity labels exist
        for char_index in range(start_char, end_char):
            if char_index in chars_in_ents:
                raise ValueError(Errors.E103.format(
                    span1=chars_in_ents[char_index],
                    span2=(start_char, end_char, label)))
            chars_in_ents[char_index] = (start_char, end_char, label)

        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
        if start_token is not None and end_token is not None:
            if start_token == end_token:
                biluo[start_token] = f"U-{label}"
            else:
                biluo[start_token] = f"B-{label}"
                for i in range(start_token + 1, end_token):
                    biluo[i] = f"I-{label}"
                biluo[end_token] = f"L-{label}"
    # Now distinguish the O cases from ones where we miss the tokenization:
    # a token that shares no character with any entity gets `missing`.
    for token in doc:
        token_chars = range(token.idx, token.idx + len(token))
        if not any(i in chars_in_ents for i in token_chars):
            biluo[token.i] = missing
    return biluo
|
|
|
|
|
|
|
|
|
2019-02-06 13:50:26 +03:00
|
|
|
def spans_from_biluo_tags(doc, tags):
    """Turn per-token BILUO tags into Span objects, e.g. to overwrite the
    doc.ents.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects.
    """
    # The end index from tags_to_entities is bumped by one before building
    # the Span (presumably it is inclusive while Span takes an exclusive end
    # -- confirm against tags_to_entities).
    return [
        Span(doc, first_token, last_token + 1, label=ent_label)
        for ent_label, first_token, last_token in tags_to_entities(tags)
    ]
|
|
|
|
|
|
|
|
|
2017-11-26 18:38:01 +03:00
|
|
|
def offsets_from_biluo_tags(doc, tags):
    """Turn per-token BILUO tags into character-offset entity triples.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
    offsets = []
    # Build the spans first, then read the character offsets and label
    # string off each one.
    for ent in spans_from_biluo_tags(doc, tags):
        offsets.append((ent.start_char, ent.end_char, ent.label_))
    return offsets
|
2017-11-26 18:38:01 +03:00
|
|
|
|
|
|
|
|
2015-02-21 19:06:58 +03:00
|
|
|
def is_punct_label(label):
    """Check whether a label marks punctuation.

    label (str): The label to check.
    RETURNS (bool): True if the label is exactly "P", or equals "punct" when
        lowercased.
    """
    # "P" is matched case-sensitively; "punct" is matched in any casing.
    if label == "P":
        return True
    return label.lower() == "punct"
|