diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
index f76336d84..337fe0379 100644
--- a/examples/experiments/onto-joint/defaults.cfg
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
diff --git a/examples/experiments/onto-joint/pretrain.cfg b/examples/experiments/onto-joint/pretrain.cfg
index 40885b6e8..83991f888 100644
--- a/examples/experiments/onto-joint/pretrain.cfg
+++ b/examples/experiments/onto-joint/pretrain.cfg
@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600
diff --git a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
index 905b5b4e0..f1b702a4e 100644
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0
diff --git a/examples/experiments/ptb-joint-pos-dep/defaults.cfg b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
index 7383116e7..1c946ac60 100644
--- a/examples/experiments/ptb-joint-pos-dep/defaults.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/defaults.cfg
@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 2ffbe2d0c..6f09c6884 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -4,7 +4,7 @@ from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train_from_config import train_cli  # noqa: F401
+from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .evaluate import evaluate  # noqa: F401
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train.py
similarity index 99%
rename from spacy/cli/train_from_config.py
rename to spacy/cli/train.py
index 3a4d28356..fb4347158 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train.py
@@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg):
     train_examples = list(
         corpus.train_dataset(
             nlp,
-            noise_level=cfg["noise_level"],  # I think this is deprecated?
            orth_variant_level=cfg["orth_variant_level"],
             gold_preproc=cfg["gold_preproc"],
             max_length=cfg["max_length"],
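Across the configs and the renamed CLI above, noise_level is dropped everywhere, leaving orth_variant_level as the sole data-augmentation knob. A minimal sketch of iterating training data under the new train_dataset signature; the corpus paths are hypothetical and the import paths assume this dev tree:

    from spacy.gold import GoldCorpus
    from spacy.lang.en import English

    nlp = English()
    # Hypothetical files; any corpus in spaCy's JSON training format works.
    corpus = GoldCorpus("train.json", "dev.json")
    # noise_level is no longer accepted here; orth_variant_level is the only
    # augmentation parameter left on train_dataset.
    train_examples = list(
        corpus.train_dataset(
            nlp,
            orth_variant_level=0.0,
            gold_preproc=False,
            max_length=0,
        )
    )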
diff --git a/spacy/gold/augment.py b/spacy/gold/augment.py
index a129793c8..dda51cda6 100644
--- a/spacy/gold/augment.py
+++ b/spacy/gold/augment.py
@@ -2,6 +2,15 @@ import random
 import itertools
 
 
+def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
+    raw_text = example.text
+    orig_dict = example.to_dict()
+    variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level)
+    doc = nlp.make_doc(variant_text)
+    orig_dict["token_annotation"] = variant_token_annot
+    return example.from_dict(doc, orig_dict)
+
+
 def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return raw_text, orig_token_dict
@@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             raw_idx += 1
         raw = variant_raw
     return raw, token_dict
-
-
-def add_noise(orig, noise_level):
-    if random.random() >= noise_level:
-        return orig
-    elif type(orig) == list:
-        corrupted = [_corrupt(word, noise_level) for word in orig]
-        corrupted = [w for w in corrupted if w]
-        return corrupted
-    else:
-        return "".join(_corrupt(c, noise_level) for c in orig)
-
-
-def _corrupt(c, noise_level):
-    if random.random() >= noise_level:
-        return c
-    elif c in [".", "'", "!", "?", ","]:
-        return "\n"
-    else:
-        return c.lower()
diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py
index d55845fb8..c84f8355f 100644
--- a/spacy/gold/corpus.py
+++ b/spacy/gold/corpus.py
@@ -8,7 +8,7 @@ from ..tokens import Doc
 from .. import util
 from ..errors import Errors, AlignmentError
 from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants, add_noise
+from .augment import make_orth_variants
 from .example import Example
 
 
@@ -148,7 +148,6 @@ class GoldCorpus(object):
         nlp,
         gold_preproc=False,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         ignore_misaligned=False,
     ):
@@ -160,7 +159,6 @@ class GoldCorpus(object):
             train_annotations,
             gold_preproc,
             max_length=max_length,
-            noise_level=noise_level,
             orth_variant_level=orth_variant_level,
             make_projective=True,
             ignore_misaligned=ignore_misaligned,
@@ -194,33 +192,31 @@ class GoldCorpus(object):
         annotations,
         gold_preproc,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         make_projective=False,
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
         for eg_dict in annotations:
+            token_annot = eg_dict.get("token_annotation", {})
             if eg_dict["text"]:
-                example = Example.from_dict(
-                    nlp.make_doc(eg_dict["text"]),
-                    eg_dict
-                )
+                doc = nlp.make_doc(eg_dict["text"])
+            elif "words" in token_annot:
+                doc = Doc(nlp.vocab, words=token_annot["words"])
             else:
-                example = Example.from_dict(
-                    Doc(nlp.vocab, words=eg_dict["words"]),
-                    eg_dict
-                )
+                raise ValueError("Expecting either 'text' or token_annotation.words annotation")
+
             if gold_preproc:
-                # TODO: Data augmentation
+                variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
+                doc = nlp.make_doc(variant_text)
+                eg_dict["token_annotation"] = variant_token_annot
+                example = Example.from_dict(doc, eg_dict)
                 examples = example.split_sents()
             else:
+                example = Example.from_dict(doc, eg_dict)
                 examples = [example]
+
             for eg in examples:
                 if (not max_length) or len(eg.predicted) < max_length:
-                    if ignore_misaligned:
-                        try:
-                            _ = eg._deprecated_get_gold()
-                        except AlignmentError:
-                            continue
                     yield eg
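The new helper takes augmentation out of the corpus loop, so callers can apply it to an Example on demand (the updated test_make_orth_variants further down does exactly this). A minimal sketch with a made-up sentence; import paths assume this dev tree:

    from spacy.gold import Example
    from spacy.gold.augment import make_orth_variants_example
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("The quick brown fox jumps over the lazy dog.")
    example = Example.from_dict(doc, {"words": [t.text for t in doc]})
    # With orth_variant_level=0.2, each call has a 20% chance of swapping in
    # orthographic variants (casing, quote/punctuation forms) before rebuilding
    # the Example; otherwise the text comes back unchanged.
    variant = make_orth_variants_example(nlp, example, orth_variant_level=0.2)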
diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 402228994..b5d1b1402 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -126,7 +126,7 @@ cdef class Example:
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
                 "entities": biluo_tags_from_doc(self.reference),
-                "links": [],  # TODO
+                "links": self._links_to_dict()
             },
             "token_annotation": {
                 "ids": [t.i+1 for t in self.reference],
@@ -141,6 +141,14 @@ cdef class Example:
             }
         }
 
+    def _links_to_dict(self):
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
+
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on sent_starts
         and return a list of the new Examples"""
diff --git a/spacy/language.py b/spacy/language.py
index c168afeea..b9a84e1bb 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -646,20 +646,6 @@ class Language(object):
             sgd(W, dW, key=key)
         return losses
 
-    def preprocess_gold(self, examples):
-        """Can be called before training to pre-process gold data. By default,
-        it handles nonprojectivity and adds missing tags to the tag map.
-
-        examples (iterable): `Example` objects.
-        YIELDS (tuple): `Example` objects.
-        """
-        # TODO: This is deprecated right?
-        for name, proc in self.pipeline:
-            if hasattr(proc, "preprocess_gold"):
-                examples = proc.preprocess_gold(examples)
-        for eg in examples:
-            yield eg
-
     def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 6ba7ad396..1512955a5 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -459,9 +459,9 @@ cdef class ArcEager(TransitionSystem):
                 actions[RIGHT][label] = 1
                 actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            heads, labels = nonproj.projectivize(example.token_annotation.heads,
-                                                 example.token_annotation.deps)
-            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+            heads, labels = nonproj.projectivize(example.get_aligned("HEAD"),
+                                                 example.get_aligned("DEP"))
+            for child, head, label in zip(example.get_aligned("ID"), heads, labels):
                 if label.upper() == 'ROOT' :
                     label = 'ROOT'
                 if head == child:
diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx
index 5b1f57d2b..eded53fac 100644
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@@ -78,8 +78,8 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
-                                             example.token_annotation.deps)
+        proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
+                                             example.get_aligned("DEP"))
         # set the label to ROOT for each root dependent
         deco_deps = ['ROOT' if head == i else deco_deps[i]
                      for i, head in enumerate(proj_heads)]
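With the token_annotation attribute gone, the parser code above reads gold heads and deps through Example.get_aligned, which projects reference annotations onto the predicted tokens. A rough sketch of that accessor under the dev API shown in these hunks, using identical tokenization so the projection is a straight copy:

    from spacy.gold import Example
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("She ate pizza")
    annots = {
        "words": ["She", "ate", "pizza"],
        "heads": [1, 1, 1],  # each token's head, as a token index
        "deps": ["nsubj", "ROOT", "dobj"],
    }
    example = Example.from_dict(doc, annots)
    # With matching tokenization these come back unchanged.
    print(example.get_aligned("HEAD"))                 # [1, 1, 1]
    print(example.get_aligned("DEP", as_string=True))  # ['nsubj', 'ROOT', 'dobj']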
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index d98a93f2f..726492138 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -11,6 +11,7 @@ import pytest
 import srsly
 
 from .util import make_tempdir
+from ..gold.augment import make_orth_variants_example
 
 
 @pytest.fixture
@@ -200,13 +201,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
+    offset_start = len("I flew to ")
+    offset_end = len("I flew to San Francisco Valley")
+    entities = [(offset_start, offset_end, "LOC")]
+    links = {(offset_start, offset_end): {"Q816843": 1.0}}
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
     assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
+    assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}
 
     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(
@@ -384,8 +388,8 @@ def test_make_orth_variants(doc):
     goldcorpus = GoldCorpus(str(json_file), str(json_file))
 
     # due to randomness, test only that this runs with no errors for now
-    train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
-    train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
+    train_example = next(goldcorpus.train_dataset(nlp))
+    variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
 
 
 @pytest.mark.parametrize(
@@ -494,18 +498,7 @@ def test_split_sents(merged_dict):
         Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
         merged_dict
     )
-    assert len(get_parses_from_example(
-        example,
-        merge=False,
-        vocab=nlp.vocab,
-        make_projective=False)
-    ) == 2
-    assert len(get_parses_from_example(
-        example,
-        merge=True,
-        vocab=nlp.vocab,
-        make_projective=False
-    )) == 1
+    assert example.text == "Hi there everyone It is just me"
     split_examples = example.split_sents()
     assert len(split_examples) == 2
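The strengthened test above checks that entity links now survive a to_dict() round trip. The same behaviour in isolation, as a minimal sketch against this dev API (import paths assume this dev tree):

    from spacy.gold import Example
    from spacy.lang.en import English

    nlp = English()
    doc = nlp.make_doc("I flew to San Francisco Valley.")
    start, end = len("I flew to "), len("I flew to San Francisco Valley")
    annots = {
        "words": [t.text for t in doc],
        "entities": [(start, end, "LOC")],
        "links": {(start, end): {"Q816843": 1.0}},
    }
    example = Example.from_dict(doc, annots)
    # _links_to_dict() rebuilds the links mapping from the reference doc's
    # entities and their KB IDs, instead of the old hard-coded empty list.
    assert example.to_dict()["doc_annotation"]["links"] == {(start, end): {"Q816843": 1.0}}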