Mirror of https://github.com/explosion/spaCy.git (synced 2025-03-12 15:25:47 +03:00)

Commit a79f0598a6: Merge branch 'whatif/arrow' of https://github.com/explosion/spaCy into whatif/arrow
@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600

@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600

@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0

@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1

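The four config hunks above all make the same change: the deprecated `noise_level` setting is dropped from the example training configs, leaving `orth_variant_level` as the only data-augmentation knob. As a rough sketch, a trimmed augmentation block in one of these configs would now read (the `[training]` section name is an assumption; it is not visible in the diff):

[training]
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
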
@@ -4,7 +4,7 @@ from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train_from_config import train_cli  # noqa: F401
+from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .evaluate import evaluate  # noqa: F401

@@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg):
     train_examples = list(
         corpus.train_dataset(
             nlp,
-            noise_level=cfg["noise_level"],  # I think this is deprecated?
             orth_variant_level=cfg["orth_variant_level"],
             gold_preproc=cfg["gold_preproc"],
             max_length=cfg["max_length"],

@@ -2,6 +2,15 @@ import random
 import itertools
 
 
+def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
+    raw_text = example.text
+    orig_dict = example.to_dict()
+    variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level)
+    doc = nlp.make_doc(variant_text)
+    orig_dict["token_annotation"] = variant_token_annot
+    return example.from_dict(doc, orig_dict)
+
+
 def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return raw_text, orig_token_dict

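The new `make_orth_variants_example` helper applies orthographic augmentation to a whole `Example` instead of to raw text plus a token dict. A minimal usage sketch, assuming a blank English pipeline and an invented sample sentence (the `spacy.gold.augment` and `spacy.gold.example` module paths are inferred from the relative imports elsewhere in this diff):

import spacy
from spacy.gold.augment import make_orth_variants_example
from spacy.gold.example import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("I can't go there.")
example = Example.from_dict(doc, {"words": [t.text for t in doc]})
# orth_variant_level is the probability that any variant is produced at all;
# at 0.0 the original orthography is kept.
variant = make_orth_variants_example(nlp, example, orth_variant_level=0.5)
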
@@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
                 raw_idx += 1
         raw = variant_raw
     return raw, token_dict
-
-
-def add_noise(orig, noise_level):
-    if random.random() >= noise_level:
-        return orig
-    elif type(orig) == list:
-        corrupted = [_corrupt(word, noise_level) for word in orig]
-        corrupted = [w for w in corrupted if w]
-        return corrupted
-    else:
-        return "".join(_corrupt(c, noise_level) for c in orig)
-
-
-def _corrupt(c, noise_level):
-    if random.random() >= noise_level:
-        return c
-    elif c in [".", "'", "!", "?", ","]:
-        return "\n"
-    else:
-        return c.lower()

@@ -8,7 +8,7 @@ from ..tokens import Doc
 from .. import util
 from ..errors import Errors, AlignmentError
 from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants, add_noise
+from .augment import make_orth_variants
 from .example import Example
 
 

@@ -148,7 +148,6 @@ class GoldCorpus(object):
         nlp,
         gold_preproc=False,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         ignore_misaligned=False,
     ):

@@ -160,7 +159,6 @@ class GoldCorpus(object):
             train_annotations,
             gold_preproc,
             max_length=max_length,
-            noise_level=noise_level,
             orth_variant_level=orth_variant_level,
             make_projective=True,
             ignore_misaligned=ignore_misaligned,

@@ -194,33 +192,31 @@ class GoldCorpus(object):
         annotations,
         gold_preproc,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         make_projective=False,
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
         for eg_dict in annotations:
+            token_annot = eg_dict.get("token_annotation", {})
             if eg_dict["text"]:
-                example = Example.from_dict(
-                    nlp.make_doc(eg_dict["text"]),
-                    eg_dict
-                )
+                doc = nlp.make_doc(eg_dict["text"])
+            elif "words" in token_annot:
+                doc = Doc(nlp.vocab, words=token_annot["words"])
             else:
-                example = Example.from_dict(
-                    Doc(nlp.vocab, words=eg_dict["words"]),
-                    eg_dict
-                )
+                raise ValueError("Expecting either 'text' or token_annotation.words annotation")
             if gold_preproc:
-                # TODO: Data augmentation
+                variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
+                doc = nlp.make_doc(variant_text)
+                eg_dict["token_annotation"] = variant_token_annot
+                example = Example.from_dict(doc, eg_dict)
                 examples = example.split_sents()
 
             else:
+                example = Example.from_dict(doc, eg_dict)
                 examples = [example]
 
             for eg in examples:
                 if (not max_length) or len(eg.predicted) < max_length:
-                    if ignore_misaligned:
-                        try:
-                            _ = eg._deprecated_get_gold()
-                        except AlignmentError:
-                            continue
                     yield eg

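After this rewrite the corpus reader resolves the `Doc` up front: from raw text when present, else from `token_annotation.words`, and it raises a `ValueError` when neither is available. A hedged illustration of the three input shapes (these dicts are invented; the real ones come from `json_to_annotations`):

eg_dict = {"text": "I like London.", "token_annotation": {}}
# -> doc = nlp.make_doc("I like London.")

eg_dict = {"text": None, "token_annotation": {"words": ["I", "like", "London", "."]}}
# -> doc = Doc(nlp.vocab, words=["I", "like", "London", "."])

eg_dict = {"text": None, "token_annotation": {}}
# -> ValueError: Expecting either 'text' or token_annotation.words annotation
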
@@ -126,7 +126,7 @@ cdef class Example:
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
                 "entities": biluo_tags_from_doc(self.reference),
-                "links": [],  # TODO
+                "links": self._links_to_dict()
             },
             "token_annotation": {
                 "ids": [t.i+1 for t in self.reference],

@@ -141,6 +141,14 @@ cdef class Example:
             }
         }
 
+    def _links_to_dict(self):
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
+
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""

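`_links_to_dict` recovers entity-linking annotations from the reference doc, keyed by character offsets, replacing the `"links": []  # TODO` placeholder in `to_dict`. A small sketch of the structure it yields, using an invented entity and KB id:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp.make_doc("I flew to San Francisco")
doc.ents = [Span(doc, 3, 5, label="LOC", kb_id="Q62")]
# For a reference doc like this, _links_to_dict() returns
#     {(10, 23): {"Q62": 1.0}}
# i.e. {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
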
@@ -646,20 +646,6 @@ class Language(object):
                     sgd(W, dW, key=key)
         return losses
 
-    def preprocess_gold(self, examples):
-        """Can be called before training to pre-process gold data. By default,
-        it handles nonprojectivity and adds missing tags to the tag map.
-
-        examples (iterable): `Example` objects.
-        YIELDS (tuple): `Example` objects.
-        """
-        # TODO: This is deprecated right?
-        for name, proc in self.pipeline:
-            if hasattr(proc, "preprocess_gold"):
-                examples = proc.preprocess_gold(examples)
-        for eg in examples:
-            yield eg
-
     def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.

@@ -459,9 +459,9 @@ cdef class ArcEager(TransitionSystem):
             actions[RIGHT][label] = 1
             actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            heads, labels = nonproj.projectivize(example.token_annotation.heads,
-                                                 example.token_annotation.deps)
-            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+            heads, labels = nonproj.projectivize(example.get_aligned("HEAD"),
+                                                 example.get_aligned("DEP"))
+            for child, head, label in zip(example.get_aligned("ID"), heads, labels):
                 if label.upper() == 'ROOT' :
                     label = 'ROOT'
                 if head == child:

@@ -78,8 +78,8 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
-                                             example.token_annotation.deps)
+        proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
+                                             example.get_aligned("DEP"))
         # set the label to ROOT for each root dependent
         deco_deps = ['ROOT' if head == i else deco_deps[i]
                      for i, head in enumerate(proj_heads)]

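Both parser hunks swap direct `token_annotation` attribute access for the field-based `get_aligned` API, which projects the gold annotations onto the current tokenization. The mapping, restated from the diffs above:

# before
heads = example.token_annotation.heads
deps = example.token_annotation.deps
ids = example.token_annotation.ids
# after: field names mirror the token attribute names
heads = example.get_aligned("HEAD")
deps = example.get_aligned("DEP")
ids = example.get_aligned("ID")
# string values are available on request, as in the tests below:
ent_types = example.get_aligned("ENT_TYPE", as_string=True)
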
@@ -11,6 +11,7 @@ import pytest
 import srsly
 
 from .util import make_tempdir
+from ..gold.augment import make_orth_variants_example
 
 
 @pytest.fixture

@@ -200,13 +201,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
+    offset_start = len("I flew to ")
+    offset_end = len("I flew to San Francisco Valley")
+    entities = [(offset_start, offset_end, "LOC")]
+    links = {(offset_start, offset_end): {"Q816843": 1.0}}
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
     assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
+    assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}
 
     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(

@@ -384,8 +388,8 @@ def test_make_orth_variants(doc):
     goldcorpus = GoldCorpus(str(json_file), str(json_file))
 
     # due to randomness, test only that this runs with no errors for now
-    train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
-    train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
+    train_example = next(goldcorpus.train_dataset(nlp))
+    variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
 
 
 @pytest.mark.parametrize(

@@ -494,18 +498,7 @@ def test_split_sents(merged_dict):
         Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
         merged_dict
     )
-    assert len(get_parses_from_example(
-        example,
-        merge=False,
-        vocab=nlp.vocab,
-        make_projective=False)
-    ) == 2
-    assert len(get_parses_from_example(
-        example,
-        merge=True,
-        vocab=nlp.vocab,
-        make_projective=False
-    )) == 1
+    assert example.text == "Hi there everyone It is just me"
 
     split_examples = example.split_sents()
     assert len(split_examples) == 2