Merge branch 'whatif/arrow' of https://github.com/explosion/spaCy into whatif/arrow

Matthew Honnibal 2020-06-20 02:36:40 +02:00
commit a79f0598a6
13 changed files with 47 additions and 80 deletions

View File

@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600

View File

@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600

View File

@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0

View File

@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1

View File

@@ -4,7 +4,7 @@ from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train_from_config import train_cli  # noqa: F401
+from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .evaluate import evaluate  # noqa: F401

View File

@@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg):
     train_examples = list(
         corpus.train_dataset(
             nlp,
-            noise_level=cfg["noise_level"],
             orth_variant_level=cfg["orth_variant_level"],
             gold_preproc=cfg["gold_preproc"],
             max_length=cfg["max_length"],

View File

@@ -2,6 +2,15 @@ import random
 import itertools


+def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
+    raw_text = example.text
+    orig_dict = example.to_dict()
+    variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level)
+    doc = nlp.make_doc(variant_text)
+    orig_dict["token_annotation"] = variant_token_annot
+    return example.from_dict(doc, orig_dict)
+
+
 def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return raw_text, orig_token_dict
@@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             raw_idx += 1
         raw = variant_raw
     return raw, token_dict
-
-
-def add_noise(orig, noise_level):
-    if random.random() >= noise_level:
-        return orig
-    elif type(orig) == list:
-        corrupted = [_corrupt(word, noise_level) for word in orig]
-        corrupted = [w for w in corrupted if w]
-        return corrupted
-    else:
-        return "".join(_corrupt(c, noise_level) for c in orig)
-
-
-def _corrupt(c, noise_level):
-    if random.random() >= noise_level:
-        return c
-    elif c in [".", "'", "!", "?", ","]:
-        return "\n"
-    else:
-        return c.lower()
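
For context, the new make_orth_variants_example helper introduced above can be exercised roughly as follows. This is a minimal sketch and not code from this commit: the import paths (spacy.gold.example, spacy.gold.augment) are inferred from the relative imports elsewhere in this diff, and the blank English pipeline and toy annotations are assumptions for illustration.

import spacy
from spacy.gold.example import Example
from spacy.gold.augment import make_orth_variants_example

nlp = spacy.blank("en")
words = ["I", "can", "not", "do", "this", "."]
doc = nlp.make_doc(" ".join(words))
# Build an Example from a flattened annotation dict, as the test suite does.
example = Example.from_dict(doc, {"words": words})
# At a low orth_variant_level the text usually comes back unchanged; the call
# mainly demonstrates the new helper's signature.
augmented = make_orth_variants_example(nlp, example, orth_variant_level=0.2)
print(augmented.text)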

View File

@@ -8,7 +8,7 @@ from ..tokens import Doc
 from .. import util
 from ..errors import Errors, AlignmentError
 from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants, add_noise
+from .augment import make_orth_variants
 from .example import Example
@@ -148,7 +148,6 @@ class GoldCorpus(object):
         nlp,
         gold_preproc=False,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         ignore_misaligned=False,
     ):
@@ -160,7 +159,6 @@ class GoldCorpus(object):
             train_annotations,
             gold_preproc,
             max_length=max_length,
-            noise_level=noise_level,
             orth_variant_level=orth_variant_level,
             make_projective=True,
             ignore_misaligned=ignore_misaligned,
@@ -194,33 +192,31 @@ class GoldCorpus(object):
         annotations,
         gold_preproc,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         make_projective=False,
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
         for eg_dict in annotations:
+            token_annot = eg_dict.get("token_annotation", {})
             if eg_dict["text"]:
-                example = Example.from_dict(
-                    nlp.make_doc(eg_dict["text"]),
-                    eg_dict
-                )
+                doc = nlp.make_doc(eg_dict["text"])
+            elif "words" in token_annot:
+                doc = Doc(nlp.vocab, words=token_annot["words"])
             else:
-                example = Example.from_dict(
-                    Doc(nlp.vocab, words=eg_dict["words"]),
-                    eg_dict
-                )
+                raise ValueError("Expecting either 'text' or token_annotation.words annotation")
             if gold_preproc:
-                # TODO: Data augmentation
+                variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
+                doc = nlp.make_doc(variant_text)
+                eg_dict["token_annotation"] = variant_token_annot
+                example = Example.from_dict(doc, eg_dict)
                 examples = example.split_sents()
             else:
+                example = Example.from_dict(doc, eg_dict)
                 examples = [example]
             for eg in examples:
                 if (not max_length) or len(eg.predicted) < max_length:
-                    if ignore_misaligned:
-                        try:
-                            _ = eg._deprecated_get_gold()
-                        except AlignmentError:
-                            continue
                     yield eg

View File

@@ -126,7 +126,7 @@ cdef class Example:
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
                 "entities": biluo_tags_from_doc(self.reference),
-                "links": [],  # TODO
+                "links": self._links_to_dict()
             },
             "token_annotation": {
                 "ids": [t.i+1 for t in self.reference],
@@ -141,6 +141,14 @@ cdef class Example:
             }
         }

+    def _links_to_dict(self):
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
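
To make the new "links" entry concrete, here is a small sketch modeled on the test added later in this commit. The import paths and the "Q90" knowledge-base id are assumptions for illustration only; the character offsets follow the reference text "Paris is nice".

from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.gold.example import Example

vocab = Vocab()
words = ["Paris", "is", "nice"]
doc = Doc(vocab, words=words)
entities = [(0, 5, "GPE")]      # character offsets of "Paris"
links = {(0, 5): {"Q90": 1.0}}  # entity offsets -> {kb_id: score}
example = Example.from_dict(doc, {"words": words, "entities": entities, "links": links})
# With _links_to_dict() in place, the KB id now round-trips through to_dict():
print(example.to_dict()["doc_annotation"]["links"])  # expected: {(0, 5): {"Q90": 1.0}}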

View File

@@ -646,20 +646,6 @@ class Language(object):
                 sgd(W, dW, key=key)
         return losses

-    def preprocess_gold(self, examples):
-        """Can be called before training to pre-process gold data. By default,
-        it handles nonprojectivity and adds missing tags to the tag map.
-
-        examples (iterable): `Example` objects.
-        YIELDS (tuple): `Example` objects.
-        """
-        for name, proc in self.pipeline:
-            if hasattr(proc, "preprocess_gold"):
-                examples = proc.preprocess_gold(examples)
-        for eg in examples:
-            yield eg
-
     def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.

View File

@@ -459,9 +459,9 @@ cdef class ArcEager(TransitionSystem):
             actions[RIGHT][label] = 1
             actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            heads, labels = nonproj.projectivize(example.token_annotation.heads,
-                                                 example.token_annotation.deps)
-            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+            heads, labels = nonproj.projectivize(example.get_aligned("HEAD"),
+                                                 example.get_aligned("DEP"))
+            for child, head, label in zip(example.get_aligned("ID"), heads, labels):
                 if label.upper() == 'ROOT' :
                     label = 'ROOT'
                 if head == child:

View File

@@ -78,8 +78,8 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
-                                             example.token_annotation.deps)
+        proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
+                                             example.get_aligned("DEP"))
         # set the label to ROOT for each root dependent
         deco_deps = ['ROOT' if head == i else deco_deps[i]
                      for i, head in enumerate(proj_heads)]

View File

@@ -11,6 +11,7 @@ import pytest
 import srsly

 from .util import make_tempdir
+from ..gold.augment import make_orth_variants_example


 @pytest.fixture
@@ -200,13 +201,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
+    offset_start = len("I flew to ")
+    offset_end = len("I flew to San Francisco Valley")
+    entities = [(offset_start, offset_end, "LOC")]
+    links = {(offset_start, offset_end): {"Q816843": 1.0}}
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
     assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
+    assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}

     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(
@@ -384,8 +388,8 @@ def test_make_orth_variants(doc):
     goldcorpus = GoldCorpus(str(json_file), str(json_file))

     # due to randomness, test only that this runs with no errors for now
-    train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
-    train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
+    train_example = next(goldcorpus.train_dataset(nlp))
+    variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)


 @pytest.mark.parametrize(
@@ -494,18 +498,7 @@ def test_split_sents(merged_dict):
         Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
         merged_dict
     )
-    assert len(get_parses_from_example(
-        example,
-        merge=False,
-        vocab=nlp.vocab,
-        make_projective=False)
-    ) == 2
-    assert len(get_parses_from_example(
-        example,
-        merge=True,
-        vocab=nlp.vocab,
-        make_projective=False
-    )) == 1
+    assert example.text == "Hi there everyone It is just me"

     split_examples = example.split_sents()
     assert len(split_examples) == 2