spaCy/spacy/gold.pyx

1231 lines
50 KiB
Cython
Raw Normal View History

2017-03-15 17:29:42 +03:00
# cython: profile=True
# coding: utf8
from __future__ import unicode_literals, print_function
import re
import random
import numpy
import tempfile
import shutil
import itertools
from pathlib import Path
import srsly
from .syntax import nonproj
from .tokens import Doc, Span
2019-10-28 14:36:23 +03:00
from .errors import Errors, AlignmentError
from .compat import path2str, basestring_
2017-10-27 22:07:59 +03:00
from . import util
2015-02-21 19:06:58 +03:00
USE_NEW_ALIGN = False
punct_re = re.compile(r"\W")
def tags_to_entities(tags):
entities = []
start = None
for i, tag in enumerate(tags):
2016-11-25 17:57:59 +03:00
if tag is None:
continue
if tag.startswith("O"):
# TODO: We shouldn't be getting these malformed inputs. Fix this.
if start is not None:
start = None
continue
elif tag == "-":
continue
elif tag.startswith("I"):
if start is None:
raise ValueError(Errors.E067.format(tags=tags[:i + 1]))
continue
if tag.startswith("U"):
entities.append((tag[2:], i, i))
elif tag.startswith("B"):
start = i
elif tag.startswith("L"):
entities.append((tag[2:], start, i))
start = None
else:
raise ValueError(Errors.E068.format(tag=tag))
return entities
_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
def _normalize_for_alignment(tokens):
tokens = [w.replace(" ", "").lower() for w in tokens]
output = []
for token in tokens:
token = token.replace(" ", "").lower()
for before, after in _ALIGNMENT_NORM_MAP:
token = token.replace(before, after)
output.append(token)
return output
def _align_before_v2_2_2(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations, using the Levenshtein
algorithm. The alignment is case-insensitive.
tokens_a (List[str]): The candidate tokenization.
tokens_b (List[str]): The reference tokenization.
RETURNS: (tuple): A 5-tuple consisting of the following information:
* cost (int): The number of misaligned tokens.
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
it has the value -1.
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
the same token of `tokens_b`.
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
from . import _align
if tokens_a == tokens_b:
alignment = numpy.arange(len(tokens_a))
return 0, alignment, alignment, {}, {}
tokens_a = [w.replace(" ", "").lower() for w in tokens_a]
tokens_b = [w.replace(" ", "").lower() for w in tokens_b]
cost, i2j, j2i, matrix = _align.align(tokens_a, tokens_b)
i2j_multi, j2i_multi = _align.multi_align(i2j, j2i, [len(w) for w in tokens_a],
[len(w) for w in tokens_b])
for i, j in list(i2j_multi.items()):
if i2j_multi.get(i+1) != j and i2j_multi.get(i-1) != j:
i2j[i] = j
i2j_multi.pop(i)
for j, i in list(j2i_multi.items()):
if j2i_multi.get(j+1) != i and j2i_multi.get(j-1) != i:
j2i[j] = i
j2i_multi.pop(j)
return cost, i2j, j2i, i2j_multi, j2i_multi
2019-07-17 15:29:52 +03:00
def align(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations.
2019-07-17 14:59:17 +03:00
2019-07-17 15:29:52 +03:00
tokens_a (List[str]): The candidate tokenization.
tokens_b (List[str]): The reference tokenization.
2019-07-17 14:59:17 +03:00
RETURNS: (tuple): A 5-tuple consisting of the following information:
* cost (int): The number of misaligned tokens.
2019-07-17 15:29:52 +03:00
* a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
to `tokens_b[6]`. If there's no one-to-one alignment for a token,
it has the value -1.
2019-07-17 14:59:17 +03:00
* b2a (List[int]): The same as `a2b`, but mapping the other direction.
2019-07-17 15:29:52 +03:00
* a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
the same token of `tokens_b`.
2019-07-17 14:59:17 +03:00
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
if not USE_NEW_ALIGN:
return _align_before_v2_2_2(tokens_a, tokens_b)
tokens_a = _normalize_for_alignment(tokens_a)
tokens_b = _normalize_for_alignment(tokens_b)
cost = 0
a2b = numpy.empty(len(tokens_a), dtype="i")
b2a = numpy.empty(len(tokens_b), dtype="i")
a2b_multi = {}
b2a_multi = {}
i = 0
j = 0
offset_a = 0
offset_b = 0
while i < len(tokens_a) and j < len(tokens_b):
a = tokens_a[i][offset_a:]
b = tokens_b[j][offset_b:]
a2b[i] = b2a[j] = -1
if a == b:
if offset_a == offset_b == 0:
a2b[i] = j
b2a[j] = i
elif offset_a == 0:
cost += 2
a2b_multi[i] = j
elif offset_b == 0:
cost += 2
b2a_multi[j] = i
offset_a = offset_b = 0
i += 1
j += 1
elif a == "":
assert offset_a == 0
cost += 1
i += 1
elif b == "":
assert offset_b == 0
cost += 1
j += 1
elif b.startswith(a):
cost += 1
if offset_a == 0:
a2b_multi[i] = j
i += 1
offset_a = 0
offset_b += len(a)
elif a.startswith(b):
cost += 1
if offset_b == 0:
b2a_multi[j] = i
j += 1
offset_b = 0
offset_a += len(b)
else:
assert "".join(tokens_a) != "".join(tokens_b)
2019-10-28 14:36:23 +03:00
raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
return cost, a2b, b2a, a2b_multi, b2a_multi
2015-05-27 20:13:11 +03:00
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.
DOCS: https://spacy.io/api/goldcorpus
"""
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
train (unicode or Path): File or directory of training data.
dev (unicode or Path): File or directory of development data.
2017-10-27 18:02:55 +03:00
RETURNS (GoldCorpus): The newly created object.
"""
2017-05-22 18:40:46 +03:00
self.limit = limit
if isinstance(train, str) or isinstance(train, Path):
train = self.read_examples(self.walk_corpus(train))
dev = self.read_examples(self.walk_corpus(dev))
# Write temp directory with one doc per file, so we can shuffle and stream
self.tmp_dir = Path(tempfile.mkdtemp())
self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit)
def __del__(self):
shutil.rmtree(path2str(self.tmp_dir))
@staticmethod
def write_msgpack(directory, examples, limit=0):
if not directory.exists():
directory.mkdir()
n = 0
for i, example in enumerate(examples):
ex_dict = example.to_dict()
text = example.text
srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
n += len(example.token_annotations)
if limit and n >= limit:
break
@staticmethod
def walk_corpus(path):
path = util.ensure_path(path)
if not path.is_dir():
return [path]
paths = [path]
locs = []
seen = set()
for path in paths:
if str(path) in seen:
continue
seen.add(str(path))
if path.parts[-1].startswith("."):
continue
elif path.is_dir():
paths.extend(path.iterdir())
elif path.parts[-1].endswith((".json", ".jsonl")):
locs.append(path)
return locs
@staticmethod
def read_examples(locs, limit=0):
""" Yield training examples """
2017-05-22 18:40:46 +03:00
i = 0
for loc in locs:
loc = util.ensure_path(loc)
if loc.parts[-1].endswith("json"):
examples = read_json_file(loc)
elif loc.parts[-1].endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
first_gold_tuple = next(gold_tuples)
gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
# TODO: proper format checks with schemas
if isinstance(first_gold_tuple, dict):
if first_gold_tuple.get("paragraphs", None):
examples = read_json_object(gold_tuples)
elif first_gold_tuple.get("doc_annotation", None):
examples = []
for ex_dict in gold_tuples:
doc = ex_dict.get("doc", None)
if doc is None:
doc = ex_dict.get("text", None)
examples.append(Example.from_dict(ex_dict, doc=doc))
elif loc.parts[-1].endswith("msg"):
text, ex_dict = srsly.read_msgpack(loc)
examples = [Example.from_dict(ex_dict, doc=text)]
else:
supported = ("json", "jsonl", "msg")
raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
for example in examples:
yield example
i += len(example.token_annotations)
if limit and i >= limit:
return
@property
def dev_examples(self):
locs = (self.tmp_dir / "dev").iterdir()
yield from self.read_examples(locs, limit=self.limit)
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
@property
def train_examples(self):
locs = (self.tmp_dir / "train").iterdir()
yield from self.read_examples(locs, limit=self.limit)
def count_train(self):
# TODO: should this count words or sentences ?
n = 0
2017-06-05 04:18:20 +03:00
i = 0
for example in self.train_examples:
for token_annotation in example.token_annotations:
n += len(token_annotation.words)
if self.limit and i >= self.limit:
break
i += 1
return n
def train_dataset(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0, orth_variant_level=0.0,
ignore_misaligned=False):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
train_examples = self.read_examples(locs, limit=self.limit)
gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc,
2017-06-05 04:16:57 +03:00
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True,
ignore_misaligned=ignore_misaligned)
yield from gold_examples
def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
yield from examples
def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
ignore_misaligned=ignore_misaligned)
yield from examples
@classmethod
def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
noise_level=0.0, orth_variant_level=0.0, make_projective=False,
ignore_misaligned=False):
""" Setting gold_preproc will result in creating a doc per 'sentence' """
for example in examples:
if gold_preproc:
example.doc = None
else:
example = example.merge_sents()
example.make_projective = make_projective
example.ignore_misaligned = ignore_misaligned
examples = cls._make_docs(nlp, example,
gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
examples = cls._make_golds(examples, vocab=nlp.vocab)
for ex in examples:
if ex.gold is not None:
if (not max_length) or len(ex.doc) < max_length:
yield ex
@classmethod
def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
# gold_preproc is not used ?!
if example.text is not None:
var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
var_text = add_noise(var_example.text, noise_level)
var_doc = nlp.make_doc(var_text)
var_example.doc = var_doc
return [var_example]
else:
var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
doc_examples = []
for token_annotation in var_example.token_annotations:
t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
doc_example = Example(doc_annotation=example.doc_annotation,
token_annotations=[token_annotation],
doc=t_doc)
doc_examples.append(doc_example)
return doc_examples
@classmethod
def _make_golds(cls, examples, vocab=None):
gold_examples = []
for example in examples:
gold_parses = example.get_gold_parses(vocab=vocab)
for (doc, gold) in gold_parses:
ex = Example(doc=doc)
ex.goldparse = gold
gold_examples.append(ex)
return gold_examples
def make_orth_variants(nlp, example, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return example
if not example.token_annotations:
return example
raw = example.text
2019-09-18 22:54:51 +03:00
if random.random() >= 0.5:
lower = True
2019-09-19 01:03:24 +03:00
if raw is not None:
raw = raw.lower()
ndsv = nlp.Defaults.single_orth_variants
ndpv = nlp.Defaults.paired_orth_variants
# modify words in paragraph_tuples
variant_example = Example(doc=raw)
for token_annotation in example.token_annotations:
words = token_annotation.words
tags = token_annotation.tags
if not words or not tags:
# add the unmodified annotation
token_dict = token_annotation.to_dict()
variant_example.add_token_annotation(**token_dict)
else:
if lower:
words = [w.lower() for w in words]
# single variants
punct_choices = [random.choice(x["variants"]) for x in ndsv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndsv)):
if tags[word_idx] in ndsv[punct_idx]["tags"] \
and words[word_idx] in ndsv[punct_idx]["variants"]:
words[word_idx] = punct_choices[punct_idx]
# paired variants
punct_choices = [random.choice(x["variants"]) for x in ndpv]
for word_idx in range(len(words)):
for punct_idx in range(len(ndpv)):
if tags[word_idx] in ndpv[punct_idx]["tags"] \
and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
# backup option: random left vs. right from pair
pair_idx = random.choice([0, 1])
# best option: rely on paired POS tags like `` / ''
if len(ndpv[punct_idx]["tags"]) == 2:
pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
# next best option: rely on position in variants
# (may not be unambiguous, so order of variants matters)
else:
for pair in ndpv[punct_idx]["variants"]:
if words[word_idx] in pair:
pair_idx = pair.index(words[word_idx])
words[word_idx] = punct_choices[punct_idx][pair_idx]
token_dict = token_annotation.to_dict()
token_dict["words"] = words
token_dict["tags"] = tags
variant_example.add_token_annotation(**token_dict)
# modify raw to match variant_paragraph_tuples
if raw is not None:
variants = []
for single_variants in ndsv:
variants.extend(single_variants["variants"])
for paired_variants in ndpv:
variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
# store variants in reverse length order to be able to prioritize
# longer matches (e.g., "---" before "--")
variants = sorted(variants, key=lambda x: len(x))
variants.reverse()
variant_raw = ""
raw_idx = 0
# add initial whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
for token_annotation in variant_example.token_annotations:
for word in token_annotation.words:
match_found = False
# add identical word
if word not in variants and raw[raw_idx:].startswith(word):
variant_raw += word
raw_idx += len(word)
match_found = True
# add variant word
else:
for variant in variants:
if not match_found and \
raw[raw_idx:].startswith(variant):
raw_idx += len(variant)
variant_raw += word
match_found = True
# something went wrong, abort
# (add a warning message?)
if not match_found:
return example
# add following whitespace
while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
variant_raw += raw[raw_idx]
raw_idx += 1
variant_example.doc = variant_raw
return variant_example
return variant_example
2017-06-05 04:16:57 +03:00
def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
elif type(orig) == list:
2019-08-29 16:39:32 +03:00
corrupted = [_corrupt(word, noise_level) for word in orig]
2017-06-05 04:16:57 +03:00
corrupted = [w for w in corrupted if w]
return corrupted
else:
2019-08-29 16:39:32 +03:00
return "".join(_corrupt(c, noise_level) for c in orig)
2017-06-05 04:16:57 +03:00
2019-08-29 16:39:32 +03:00
def _corrupt(c, noise_level):
2017-06-05 04:16:57 +03:00
if random.random() >= noise_level:
return c
elif c in [".", "'", "!", "?", ","]:
2019-08-29 16:39:32 +03:00
return "\n"
2017-06-05 04:16:57 +03:00
else:
return c.lower()
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
def read_json_object(json_corpus_section):
"""Take a list of JSON-formatted documents (e.g. from an already loaded
training data file) and yield annotations in the GoldParse format.
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
json_corpus_section (list): The data.
YIELDS (Example): The reformatted data - one training example per paragraph
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
"""
for json_doc in json_corpus_section:
examples = json_to_examples(json_doc)
for ex in examples:
yield ex
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
def json_to_examples(doc):
"""Convert an item in the JSON-formatted training data to the format
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
used by GoldParse.
doc (dict): One entry in the training data.
YIELDS (Example): The reformatted data - one training example per paragraph
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
"""
paragraphs = []
for paragraph in doc["paragraphs"]:
example = Example(doc=paragraph.get("raw", None))
for sent in paragraph["sentences"]:
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
words = []
ids = []
tags = []
heads = []
labels = []
ner = []
for i, token in enumerate(sent["tokens"]):
words.append(token["orth"])
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
ids.append(i)
tags.append(token.get('tag', "-"))
heads.append(token.get("head", 0) + i)
labels.append(token.get("dep", ""))
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
ner.append(token.get("ner", "-"))
example.add_token_annotation(ids=ids, words=words, tags=tags,
heads=heads, deps=labels, entities=ner,
brackets=sent.get("brackets", []))
cats = {}
for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"]
example.add_doc_annotation(cats=cats)
yield example
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
def read_json_file(loc, docs_filter=None, limit=None):
2017-10-27 22:07:59 +03:00
loc = util.ensure_path(loc)
2017-04-15 13:13:00 +03:00
if loc.is_dir():
for filename in loc.iterdir():
yield from read_json_file(loc / filename, limit=limit)
else:
for doc in _json_iterate(loc):
if docs_filter is not None and not docs_filter(doc):
continue
for json_data in json_to_examples(doc):
yield json_data
2015-05-06 17:27:31 +03:00
def _json_iterate(loc):
# We should've made these files jsonl...But since we didn't, parse out
# the docs one-by-one to reduce memory usage.
# It's okay to read in the whole file -- just don't parse it into JSON.
cdef bytes py_raw
loc = util.ensure_path(loc)
with loc.open("rb") as file_:
py_raw = file_.read()
raw = <char*>py_raw
cdef int square_depth = 0
cdef int curly_depth = 0
cdef int inside_string = 0
cdef int escape = 0
cdef int start = -1
cdef char c
cdef char quote = ord('"')
cdef char backslash = ord("\\")
cdef char open_square = ord("[")
cdef char close_square = ord("]")
cdef char open_curly = ord("{")
cdef char close_curly = ord("}")
for i in range(len(py_raw)):
c = raw[i]
if escape:
escape = False
continue
if c == backslash:
escape = True
continue
if c == quote:
inside_string = not inside_string
continue
if inside_string:
continue
if c == open_square:
square_depth += 1
elif c == close_square:
square_depth -= 1
elif c == open_curly:
if square_depth == 1 and curly_depth == 0:
start = i
curly_depth += 1
elif c == close_curly:
curly_depth -= 1
if square_depth == 1 and curly_depth == 0:
py_str = py_raw[start : i + 1].decode("utf8")
2018-08-14 15:03:48 +03:00
try:
yield srsly.json_loads(py_str)
except Exception:
2018-08-14 15:03:48 +03:00
print(py_str)
raise
start = -1
2017-05-26 19:32:55 +03:00
def iob_to_biluo(tags):
out = []
tags = list(tags)
while tags:
out.extend(_consume_os(tags))
out.extend(_consume_ent(tags))
return out
def _consume_os(tags):
while tags and tags[0] == "O":
yield tags.pop(0)
def _consume_ent(tags):
if not tags:
return []
tag = tags.pop(0)
target_in = "I" + tag[1:]
target_last = "L" + tag[1:]
length = 1
while tags and tags[0] in {target_in, target_last}:
length += 1
tags.pop(0)
label = tag[2:]
if length == 1:
if len(label) == 0:
raise ValueError(Errors.E177.format(tag=tag))
return ["U-" + label]
else:
start = "B-" + label
end = "L-" + label
middle = ["I-%s" % label for _ in range(1, length - 1)]
return [start] + middle + [end]
cdef class TokenAnnotation:
def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
self.ids = ids if ids else []
self.words = words if words else []
self.tags = tags if tags else []
self.heads = heads if heads else []
self.deps = deps if deps else []
self.entities = entities if entities else []
self.brackets = brackets if brackets else []
self.morphology = morphology if morphology else []
@classmethod
def from_dict(cls, token_dict):
return cls(ids=token_dict.get("ids", None),
words=token_dict.get("words", None),
tags=token_dict.get("tags", None),
heads=token_dict.get("heads", None),
deps=token_dict.get("deps", None),
entities=token_dict.get("entities", None),
morphology=token_dict.get("morphology", None),
brackets=token_dict.get("brackets", None))
def to_dict(self):
return {"ids": self.ids,
"words": self.words,
"tags": self.tags,
"heads": self.heads,
"deps": self.deps,
"entities": self.entities,
"morphology": self.morphology,
"brackets": self.brackets}
cdef class DocAnnotation:
def __init__(self, cats=None, links=None):
self.cats = cats if cats else {}
self.links = links if links else {}
@classmethod
def from_dict(cls, doc_dict):
return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))
def to_dict(self):
return {"cats": self.cats, "links": self.links}
cdef class Example:
def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
make_projective=False, ignore_misaligned=False, goldparse=None):
""" Doc can either be text, or an actual Doc """
self.doc = doc
self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
self.token_annotations = token_annotations if token_annotations else []
self.make_projective = make_projective
self.ignore_misaligned = ignore_misaligned
self.goldparse = goldparse
@classmethod
def from_gold(cls, goldparse, doc=None):
doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
token_annotation = goldparse.get_token_annotation()
return cls(doc_annotation, [token_annotation], doc)
@classmethod
def from_dict(cls, example_dict, doc=None):
token_dicts = example_dict["token_annotations"]
token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
doc_dict = example_dict["doc_annotation"]
doc_annotation = DocAnnotation.from_dict(doc_dict)
return cls(doc_annotation, token_annotations, doc)
def to_dict(self):
""" Note that this method does NOT export the doc, only the annotations ! """
token_dicts = [t.to_dict() for t in self.token_annotations]
doc_dict = self.doc_annotation.to_dict()
return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
@property
def text(self):
if self.doc is None:
return None
if isinstance(self.doc, Doc):
return self.doc.text
return self.doc
@property
def gold(self):
if self.goldparse is None:
doc, gold = self.get_gold_parses(merge=True)[0]
self.goldparse = gold
return self.goldparse
def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
deps=None, entities=None, morphology=None, brackets=None):
t = TokenAnnotation(ids=ids, words=words, tags=tags,
heads=heads, deps=deps, entities=entities,
morphology=morphology, brackets=brackets)
self.token_annotations.append(t)
def add_doc_annotation(self, cats=None, links=None):
if cats:
self.doc_annotation.cats.update(cats)
if links:
self.doc_annotation.links.update(links)
def merge_sents(self):
""" Merge the list of token annotations into one object and return this new object """
m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
m_brackets = []
i = 0
for t in self.token_annotations:
m_ids.extend(id_ + i for id_ in t.ids)
m_words.extend(t.words)
m_tags.extend(t.tags)
m_heads.extend(head + i if head else None for head in t.heads)
m_deps.extend(t.deps)
m_ents.extend(t.entities)
m_morph.extend(t.morphology)
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
for b in t.brackets)
i += len(t.ids)
m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
heads=m_heads, deps=m_deps, entities=m_ents,
morphology=m_morph, brackets=m_brackets)
return m_example
def get_gold_parses(self, merge=False, vocab=None):
"""Return a list of (doc, GoldParse) objects.
If merge is set to True, add all Token annotations to one big list."""
d = self.doc_annotation
# merging different sentences
if merge:
merged_example = self.merge_sents()
assert(len(merged_example.token_annotations)) == 1
t = merged_example.token_annotations[0]
m_doc = merged_example.doc
if not m_doc:
if not vocab:
raise ValueError(Errors.E998)
m_doc = Doc(vocab, words=t.words)
try:
gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
except AlignmentError:
if self.ignore_misaligned:
gp = None
else:
raise
return [(self.doc, gp)]
# we only have one sentence and an appropriate doc
elif len(self.token_annotations) == 1 and self.doc is not None:
t = self.token_annotations[0]
try:
gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
except AlignmentError:
if self.ignore_misaligned:
gp = None
else:
raise
return [(self.doc, gp)]
# not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
else:
parses = []
for t in self.token_annotations:
if not vocab:
raise ValueError(Errors.E998)
t_doc = Doc(vocab, words=t.words)
try:
gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
except AlignmentError:
if self.ignore_misaligned:
gp = None
else:
raise
if gp is not None:
parses.append((t_doc, gp))
return parses
@classmethod
def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
"""
Return a list of Example objects, from a variety of input formats.
make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
"""
if isinstance(examples, Example):
return [examples]
if isinstance(examples, tuple):
examples = [examples]
converted_examples = []
for ex in examples:
# convert string to Doc to Example
if isinstance(ex, basestring_):
if keep_raw_text:
converted_examples.append(Example(doc=ex))
else:
doc = make_doc(ex)
converted_examples.append(Example(doc=doc))
# convert Doc to Example
elif isinstance(ex, Doc):
converted_examples.append(Example(doc=ex))
# convert tuples to Example
elif isinstance(ex, tuple) and len(ex) == 2:
doc, gold = ex
gold_dict = {}
# convert string to Doc
if isinstance(doc, basestring_) and not keep_raw_text:
doc = make_doc(doc)
# convert dict to GoldParse
if isinstance(gold, dict):
gold_dict = gold
if doc is not None or gold.get("words", None) is not None:
gold = GoldParse(doc, **gold)
else:
gold = None
if gold is not None:
converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
else:
raise ValueError(Errors.E999.format(gold_dict=gold_dict))
else:
converted_examples.append(ex)
return converted_examples
2015-03-09 08:46:22 +03:00
cdef class GoldParse:
"""Collection for training annotations.
DOCS: https://spacy.io/api/goldparse
"""
@classmethod
def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. * Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io>
2019-09-15 23:31:31 +03:00
make_projective=make_projective)
def get_token_annotation(self):
ids = None
if self.words:
ids = list(range(len(self.words)))
return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
heads=self.heads, deps=self.labels, entities=self.ner,
morphology=self.morphology)
def __init__(self, doc, words=None, tags=None, morphology=None,
2017-10-27 18:02:55 +03:00
heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None):
"""Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
2017-10-27 18:02:55 +03:00
heads (iterable): A sequence of integers, representing syntactic
head offsets.
deps (iterable): A sequence of strings, representing the syntactic
relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
cats (dict): Labels for text classification. Each key in the dictionary
may be a string or an int, or a `(start_char, end_char, label)`
tuple, indicating that the label is applied to only part of the
document (usually a sentence). Unlike entity annotations, label
annotations can overlap, i.e. a single word can be covered by
multiple labelled spans. The TextCategorizer component expects
2017-10-27 18:02:55 +03:00
true examples of a label to have the value 1.0, and negative
examples of a label to have the value 0.0. Labels not in the
dictionary are treated as missing - the gradient for those labels
will be zero.
links (dict): A dict with `(start_char, end_char)` keys,
and the values being dicts with kb_id:value entries,
representing the external IDs in a knowledge base (KB)
mapped to either 1.0 or 0.0, indicating positive and
negative examples respectively.
RETURNS (GoldParse): The newly constructed object.
2016-11-01 14:25:36 +03:00
"""
2015-03-09 08:46:22 +03:00
self.mem = Pool()
self.loss = 0
2016-10-16 00:55:07 +03:00
self.length = len(doc)
2015-03-09 08:46:22 +03:00
self.cats = {} if cats is None else dict(cats)
self.links = {} if links is None else dict(links)
# avoid allocating memory if the doc does not contain any tokens
if self.length > 0:
if not words:
words = [token.text for token in doc]
if not tags:
tags = [None for _ in words]
if not heads:
heads = [None for _ in words]
if not deps:
deps = [None for _ in words]
if not morphology:
morphology = [None for _ in words]
if entities is None:
entities = ["-" for _ in words]
elif len(entities) == 0:
entities = ["O" for _ in words]
else:
# Translate the None values to '-', to make processing easier.
# See Issue #2603
entities = [(ent if ent is not None else "-") for ent in entities]
if not isinstance(entities[0], basestring_):
# Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities)
# These are filled by the tagger/parser/entity recogniser
self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
self.c.has_dep = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.sent_start = <int*>self.mem.alloc(len(doc), sizeof(int))
self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
self.words = [None] * len(doc)
self.tags = [None] * len(doc)
self.heads = [None] * len(doc)
self.labels = [None] * len(doc)
self.ner = [None] * len(doc)
self.morphology = [None] * len(doc)
# This needs to be done before we align the words
if make_projective and heads is not None and deps is not None:
heads, deps = nonproj.projectivize(heads, deps)
# Do many-to-one alignment for misaligned tokens.
# If we over-segment, we'll have one gold word that covers a sequence
# of predicted words
# If we under-segment, we'll have one predicted word that covers a
# sequence of gold words.
# If we "mis-segment", we'll have a sequence of predicted words covering
# a sequence of gold words. That's many-to-many -- we don't do that.
cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
heads=heads, deps=deps, entities=entities, morphology=morphology,
brackets=[])
for i, gold_i in enumerate(self.cand_to_gold):
if doc[i].text.isspace():
self.words[i] = doc[i].text
self.tags[i] = "_SP"
2017-05-26 01:15:09 +03:00
self.heads[i] = None
self.labels[i] = None
self.ner[i] = None
self.morphology[i] = set()
if gold_i is None:
if i in i2j_multi:
self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]]
self.morphology[i] = morphology[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last
if not is_last:
self.heads[i] = i+1
self.labels[i] = "subtok"
else:
head_i = heads[i2j_multi[i]]
if head_i:
self.heads[i] = self.gold_to_cand[head_i]
self.labels[i] = deps[i2j_multi[i]]
# Now set NER...This is annoying because if we've split
# got an entity word split into two, we need to adjust the
# BILUO tags. We can't have BB or LL etc.
# Case 1: O -- easy.
ner_tag = entities[i2j_multi[i]]
if ner_tag == "O":
self.ner[i] = "O"
# Case 2: U. This has to become a B I* L sequence.
elif ner_tag.startswith("U-"):
if is_first:
self.ner[i] = ner_tag.replace("U-", "B-", 1)
elif is_last:
self.ner[i] = ner_tag.replace("U-", "L-", 1)
else:
self.ner[i] = ner_tag.replace("U-", "I-", 1)
# Case 3: L. If not last, change to I.
elif ner_tag.startswith("L-"):
if is_last:
self.ner[i] = ner_tag
else:
self.ner[i] = ner_tag.replace("L-", "I-", 1)
# Case 4: I. Stays correct
elif ner_tag.startswith("I-"):
self.ner[i] = ner_tag
2017-05-26 01:15:09 +03:00
else:
self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i]
self.morphology[i] = morphology[gold_i]
if heads[gold_i] is None:
self.heads[i] = None
else:
self.heads[i] = self.gold_to_cand[heads[gold_i]]
self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))
def __len__(self):
"""Get the number of gold-standard tokens.
2017-03-15 17:29:42 +03:00
RETURNS (int): The number of gold-standard tokens.
2016-11-01 14:25:36 +03:00
"""
return self.length
2015-03-09 08:46:22 +03:00
@property
def is_projective(self):
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
property sent_starts:
def __get__(self):
return [self.c.sent_start[i] for i in range(self.length)]
def __set__(self, sent_starts):
for gold_i, is_sent_start in enumerate(sent_starts):
i = self.gold_to_cand[gold_i]
if i is not None:
if is_sent_start in (1, True):
self.c.sent_start[i] = 1
elif is_sent_start in (-1, False):
self.c.sent_start[i] = -1
else:
self.c.sent_start[i] = 0
2015-02-21 19:06:58 +03:00
def docs_to_json(docs, id=0):
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
"""Convert a list of Doc objects into the JSON-serializable format used by
the spacy train command.
docs (iterable / Doc): The Doc object(s) to convert.
id (int): Id for the JSON.
2019-10-28 14:36:23 +03:00
RETURNS (dict): The data in spaCy's JSON format
- each input doc will be treated as a paragraph in the output doc
💫 New JSON helpers, training data internals & CLI rewrite (#2932) * Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
2018-11-30 22:16:14 +03:00
"""
if isinstance(docs, Doc):
docs = [docs]
json_doc = {"id": id, "paragraphs": []}
for i, doc in enumerate(docs):
Add textcat to train CLI (#4226) * Add doc.cats to spacy.gold at the paragraph level Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in the spacy JSON training format at the paragraph level. * `spacy.gold.docs_to_json()` writes `docs.cats` * `GoldCorpus` reads in cats in each `GoldParse` * Update instances of gold_tuples to handle cats Update iteration over gold_tuples / gold_parses to handle addition of cats at the paragraph level. * Add textcat to train CLI * Add textcat options to train CLI * Add textcat labels in `TextCategorizer.begin_training()` * Add textcat evaluation to `Scorer`: * For binary exclusive classes with provided label: F1 for label * For 2+ exclusive classes: F1 macro average * For multilabel (not exclusive): ROC AUC macro average (currently relying on sklearn) * Provide user info on textcat evaluation settings, potential incompatibilities * Provide pipeline to Scorer in `Language.evaluate` for textcat config * Customize train CLI output to include only metrics relevant to current pipeline * Add textcat evaluation to evaluate CLI * Fix handling of unset arguments and config params Fix handling of unset arguments and model confiug parameters in Scorer initialization. * Temporarily add sklearn requirement * Remove sklearn version number * Improve Scorer handling of models without textcats * Fixing Scorer handling of models without textcats * Update Scorer output for python 2.7 * Modify inf in Scorer for python 2.7 * Auto-format Also make small adjustments to make auto-formatting with black easier and produce nicer results * Move error message to Errors * Update documentation * Add cats to annotation JSON format [ci skip] * Fix tpl flag and docs [ci skip] * Switch to internal roc_auc_score Switch to internal `roc_auc_score()` adapted from scikit-learn. * Add AUCROCScore tests and improve errors/warnings * Add tests for AUCROCScore and roc_auc_score * Add missing error for only positive/negative values * Remove unnecessary warnings and errors * Make reduced roc_auc_score functions private Because most of the checks and warnings have been stripped for the internal functions and access is only intended through `ROCAUCScore`, make the functions for roc_auc_score adapted from scikit-learn private. * Check that data corresponds with multilabel flag Check that the training instances correspond with the multilabel flag, adding the multilabel flag if required. * Add textcat score to early stopping check * Add more checks to debug-data for textcat * Add example training data for textcat * Add more checks to textcat train CLI * Check configuration when extending base model * Fix typos * Update textcat example data * Provide licensing details and licenses for data * Remove two labels with no positive instances from jigsaw-toxic-comment data. Co-authored-by: Ines Montani <ines@ines.io>
2019-09-15 23:31:31 +03:00
json_para = {'raw': doc.text, "sentences": [], "cats": []}
for cat, val in doc.cats.items():
json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []}
for token in sent:
json_token = {"id": token.i, "orth": token.text}
if doc.is_tagged:
json_token["tag"] = token.tag_
if doc.is_parsed:
json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_
json_token["ner"] = biluo_tags[token.i]
json_sent["tokens"].append(json_token)
json_para["sentences"].append(json_sent)
json_doc["paragraphs"].append(json_para)
2019-04-08 13:53:16 +03:00
return json_doc
def biluo_tags_from_offsets(doc, entities, missing="O"):
2017-10-27 18:02:55 +03:00
"""Encode labelled spans into per-token tags, using the
Begin/In/Last/Unit/Out scheme (BILUO).
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
2017-10-27 18:02:55 +03:00
entities (iterable): A sequence of `(start, end, label)` triples. `start`
and `end` should be character-offset integers denoting the slice into
the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
2017-10-27 18:02:55 +03:00
entity offsets don't align with the tokenization in the `Doc` object.
The training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
EXAMPLE:
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> assert tags == ["O", "O", 'U-LOC', "O"]
"""
# Ensure no overlapping entity labels exist
tokens_in_ents = {}
2019-10-28 14:36:23 +03:00
starts = {token.idx: token.i for token in doc}
ends = {token.idx + len(token): token.i for token in doc}
biluo = ["-" for _ in doc]
# Handle entity cases
for start_char, end_char, label in entities:
for token_index in range(start_char, end_char):
if token_index in tokens_in_ents.keys():
raise ValueError(Errors.E103.format(
span1=(tokens_in_ents[token_index][0],
tokens_in_ents[token_index][1],
tokens_in_ents[token_index][2]),
span2=(start_char, end_char, label)))
tokens_in_ents[token_index] = (start_char, end_char, label)
start_token = starts.get(start_char)
end_token = ends.get(end_char)
# Only interested if the tokenization is correct
if start_token is not None and end_token is not None:
if start_token == end_token:
biluo[start_token] = "U-%s" % label
else:
biluo[start_token] = "B-%s" % label
for i in range(start_token+1, end_token):
biluo[i] = "I-%s" % label
biluo[end_token] = "L-%s" % label
# Now distinguish the O cases from ones where we miss the tokenization
entity_chars = set()
for start_char, end_char, label in entities:
for i in range(start_char, end_char):
entity_chars.add(i)
for token in doc:
for i in range(token.idx, token.idx + len(token)):
if i in entity_chars:
break
else:
biluo[token.i] = missing
return biluo
def spans_from_biluo_tags(doc, tags):
"""Encode per-token tags following the BILUO scheme into Span object, e.g.
to overwrite the doc.ents.
doc (Doc): The document that the BILUO tags refer to.
entities (iterable): A sequence of BILUO tags with each tag describing one
token. Each tags string will be of the form of either "", "O" or
"{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of Span objects.
"""
token_offsets = tags_to_entities(tags)
spans = []
for label, start_idx, end_idx in token_offsets:
span = Span(doc, start_idx, end_idx + 1, label=label)
spans.append(span)
return spans
def offsets_from_biluo_tags(doc, tags):
"""Encode per-token tags following the BILUO scheme into entity offsets.
doc (Doc): The document that the BILUO tags refer to.
entities (iterable): A sequence of BILUO tags with each tag describing one
token. Each tags string will be of the form of either "", "O" or
"{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
`end` will be character-offset integers denoting the slice into the
original string.
"""
spans = spans_from_biluo_tags(doc, tags)
return [(span.start_char, span.end_char, span.label_) for span in spans]
2015-02-21 19:06:58 +03:00
def is_punct_label(label):
return label == "P" or label.lower() == "punct"