Mirror of https://github.com/explosion/spaCy.git, synced 2025-03-12 07:15:48 +03:00

Merge branch 'whatif/arrow' of https://github.com/explosion/spaCy into whatif/arrow

Commit a79f0598a6
@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600

@@ -9,7 +9,6 @@ max_length = 0
 limit = 0
 # Data augmentation
 orth_variant_level = 0.0
-noise_level = 0.0
 dropout = 0.1
 # Controls early-stopping. 0 or -1 mean unlimited.
 patience = 1600

@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = 0

@@ -6,7 +6,6 @@ init_tok2vec = null
 vectors = null
 max_epochs = 100
 orth_variant_level = 0.0
-noise_level = 0.0
 gold_preproc = true
 max_length = 0
 use_gpu = -1

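All four config hunks make the same change: the deprecated noise_level knob disappears while orth_variant_level stays. For reference, a training config's augmentation block would now read roughly as follows — a sketch assembled from the hunks above; the [training] section header is an assumption:

# Hypothetical excerpt of a training config after this change.
[training]
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
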
@@ -4,7 +4,7 @@ from .download import download  # noqa: F401
 from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
-from .train_from_config import train_cli  # noqa: F401
+from .train import train_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .evaluate import evaluate  # noqa: F401

@@ -371,7 +371,6 @@ def create_train_batches(nlp, corpus, cfg):
     train_examples = list(
         corpus.train_dataset(
             nlp,
-            noise_level=cfg["noise_level"],  # I think this is deprecated?
             orth_variant_level=cfg["orth_variant_level"],
             gold_preproc=cfg["gold_preproc"],
             max_length=cfg["max_length"],

@@ -2,6 +2,15 @@ import random
 import itertools


+def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
+    raw_text = example.text
+    orig_dict = example.to_dict()
+    variant_text, variant_token_annot = make_orth_variants(nlp, raw_text, orig_dict["token_annotation"], orth_variant_level)
+    doc = nlp.make_doc(variant_text)
+    orig_dict["token_annotation"] = variant_token_annot
+    return example.from_dict(doc, orig_dict)
+
+
 def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
         return raw_text, orig_token_dict

@@ -98,23 +107,3 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
             raw_idx += 1
     raw = variant_raw
     return raw, token_dict
-
-
-def add_noise(orig, noise_level):
-    if random.random() >= noise_level:
-        return orig
-    elif type(orig) == list:
-        corrupted = [_corrupt(word, noise_level) for word in orig]
-        corrupted = [w for w in corrupted if w]
-        return corrupted
-    else:
-        return "".join(_corrupt(c, noise_level) for c in orig)
-
-
-def _corrupt(c, noise_level):
-    if random.random() >= noise_level:
-        return c
-    elif c in [".", "'", "!", "?", ","]:
-        return "\n"
-    else:
-        return c.lower()

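The new make_orth_variants_example helper wraps the round trip callers previously had to spell out: serialize the Example, rewrite the text and token annotation, re-tokenize, rebuild. A minimal usage sketch — the blank pipeline and the Example construction are assumptions; the spacy.gold module paths follow this branch's package layout:

import spacy
from spacy.gold.example import Example  # path assumed per this branch
from spacy.gold.augment import make_orth_variants_example

nlp = spacy.blank("en")
doc = nlp.make_doc("I flew to San Francisco Valley.")
example = Example.from_dict(doc, {"words": [t.text for t in doc]})
# With probability orth_variant_level the text and token annotation are
# replaced by an orthographic variant; otherwise an equivalent Example
# is rebuilt from the unchanged annotations.
augmented = make_orth_variants_example(nlp, example, orth_variant_level=0.2)
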
@@ -8,7 +8,7 @@ from ..tokens import Doc
 from .. import util
 from ..errors import Errors, AlignmentError
 from .gold_io import read_json_file, json_to_annotations
-from .augment import make_orth_variants, add_noise
+from .augment import make_orth_variants
 from .example import Example


@@ -148,7 +148,6 @@ class GoldCorpus(object):
         nlp,
         gold_preproc=False,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         ignore_misaligned=False,
     ):

@@ -160,7 +159,6 @@ class GoldCorpus(object):
             train_annotations,
             gold_preproc,
             max_length=max_length,
-            noise_level=noise_level,
             orth_variant_level=orth_variant_level,
             make_projective=True,
             ignore_misaligned=ignore_misaligned,

@@ -194,33 +192,31 @@ class GoldCorpus(object):
         annotations,
         gold_preproc,
         max_length=None,
-        noise_level=0.0,
         orth_variant_level=0.0,
         make_projective=False,
         ignore_misaligned=False,
     ):
         """ Setting gold_preproc will result in creating a doc per sentence """
         for eg_dict in annotations:
+            token_annot = eg_dict.get("token_annotation", {})
             if eg_dict["text"]:
-                example = Example.from_dict(
-                    nlp.make_doc(eg_dict["text"]),
-                    eg_dict
-                )
+                doc = nlp.make_doc(eg_dict["text"])
+            elif "words" in token_annot:
+                doc = Doc(nlp.vocab, words=token_annot["words"])
             else:
-                example = Example.from_dict(
-                    Doc(nlp.vocab, words=eg_dict["words"]),
-                    eg_dict
-                )
+                raise ValueError("Expecting either 'text' or token_annotation.words annotation")

             if gold_preproc:
                 # TODO: Data augmentation
+                variant_text, variant_token_annot = make_orth_variants(nlp, doc.text, token_annot, orth_variant_level)
+                doc = nlp.make_doc(variant_text)
+                eg_dict["token_annotation"] = variant_token_annot
+                example = Example.from_dict(doc, eg_dict)
                 examples = example.split_sents()

             else:
+                example = Example.from_dict(doc, eg_dict)
                 examples = [example]

             for eg in examples:
                 if (not max_length) or len(eg.predicted) < max_length:
                     if ignore_misaligned:
                         try:
                             _ = eg._deprecated_get_gold()
                         except AlignmentError:
                             continue
                     yield eg

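The rewritten iterator now builds the Doc up front and fails loudly when neither raw text nor gold tokens are available; augmentation happens only on the gold_preproc path. The branching is easier to see in isolation — a condensed sketch of just the doc-construction logic from the hunk above; the standalone function wrapper is illustrative, not the actual method:

from spacy.tokens import Doc

def _doc_from_annotations(nlp, eg_dict):
    # Prefer raw text so the tokenizer produces the predicted tokens;
    # fall back to the gold tokenization when only words are annotated.
    token_annot = eg_dict.get("token_annotation", {})
    if eg_dict["text"]:
        return nlp.make_doc(eg_dict["text"])
    elif "words" in token_annot:
        return Doc(nlp.vocab, words=token_annot["words"])
    raise ValueError("Expecting either 'text' or token_annotation.words annotation")
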
@@ -126,7 +126,7 @@ cdef class Example:
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
                 "entities": biluo_tags_from_doc(self.reference),
-                "links": [],  # TODO
+                "links": self._links_to_dict()
             },
             "token_annotation": {
                 "ids": [t.i+1 for t in self.reference],

@@ -141,6 +141,14 @@ cdef class Example:
             }
         }

+    def _links_to_dict(self):
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
+
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""

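_links_to_dict produces the same links format that Example.from_dict already accepts, so to_dict output can round-trip. The shape, per the loop above:

# {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}, ...}
# e.g. one entity spanning characters 10-30, linked to KB entry "Q816843":
links = {(10, 30): {"Q816843": 1.0}}
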
@@ -646,20 +646,6 @@ class Language(object):
                 sgd(W, dW, key=key)
         return losses

-    def preprocess_gold(self, examples):
-        """Can be called before training to pre-process gold data. By default,
-        it handles nonprojectivity and adds missing tags to the tag map.
-
-        examples (iterable): `Example` objects.
-        YIELDS (tuple): `Example` objects.
-        """
-        # TODO: This is deprecated right?
-        for name, proc in self.pipeline:
-            if hasattr(proc, "preprocess_gold"):
-                examples = proc.preprocess_gold(examples)
-        for eg in examples:
-            yield eg
-
     def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.

@@ -459,9 +459,9 @@ cdef class ArcEager(TransitionSystem):
             actions[RIGHT][label] = 1
             actions[REDUCE][label] = 1
         for example in kwargs.get('gold_parses', []):
-            heads, labels = nonproj.projectivize(example.token_annotation.heads,
-                                                 example.token_annotation.deps)
-            for child, head, label in zip(example.token_annotation.ids, heads, labels):
+            heads, labels = nonproj.projectivize(example.get_aligned("HEAD"),
+                                                 example.get_aligned("DEP"))
+            for child, head, label in zip(example.get_aligned("ID"), heads, labels):
                 if label.upper() == 'ROOT' :
                     label = 'ROOT'
                 if head == child:

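Both parser-facing call sites swap direct token_annotation attribute access for Example.get_aligned, which projects gold values onto the predicted tokens. A small sketch of the accessor as these hunks use it — the pipeline and annotation keys are assumptions; the field names and the as_string flag appear elsewhere in this diff:

import spacy
from spacy.gold.example import Example  # path assumed per this branch

nlp = spacy.blank("en")
doc = nlp.make_doc("She ate pizza")
example = Example.from_dict(
    doc, {"words": ["She", "ate", "pizza"], "heads": [1, 1, 1], "deps": ["nsubj", "ROOT", "dobj"]}
)
heads = example.get_aligned("HEAD")                 # gold head indices per predicted token
deps = example.get_aligned("DEP", as_string=True)   # dependency labels as strings
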
@@ -78,8 +78,8 @@ def is_decorated(label):
 def count_decorated_labels(gold_data):
     freqs = {}
     for example in gold_data:
-        proj_heads, deco_deps = projectivize(example.token_annotation.heads,
-                                             example.token_annotation.deps)
+        proj_heads, deco_deps = projectivize(example.get_aligned("HEAD"),
+                                             example.get_aligned("DEP"))
         # set the label to ROOT for each root dependent
         deco_deps = ['ROOT' if head == i else deco_deps[i]
                      for i, head in enumerate(proj_heads)]

@@ -11,6 +11,7 @@ import pytest
 import srsly

 from .util import make_tempdir
+from ..gold.augment import make_orth_variants_example


 @pytest.fixture

@@ -200,13 +201,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
+    offset_start = len("I flew to ")
+    offset_end = len("I flew to San Francisco Valley")
+    entities = [(offset_start, offset_end, "LOC")]
+    links = {(offset_start, offset_end): {"Q816843": 1.0}}
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
     assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
+    assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}

     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(

@@ -384,8 +388,8 @@ def test_make_orth_variants(doc):
     goldcorpus = GoldCorpus(str(json_file), str(json_file))

     # due to randomness, test only that this runs with no errors for now
-    train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2))
-    train_goldparse = get_parses_from_example(train_reloaded_example)[0][1]
+    train_example = next(goldcorpus.train_dataset(nlp))
+    variant_example = make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)


 @pytest.mark.parametrize(

@@ -494,18 +498,7 @@ def test_split_sents(merged_dict):
         Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
         merged_dict
     )
-    assert len(get_parses_from_example(
-        example,
-        merge=False,
-        vocab=nlp.vocab,
-        make_projective=False)
-    ) == 2
-    assert len(get_parses_from_example(
-        example,
-        merge=True,
-        vocab=nlp.vocab,
-        make_projective=False
-    )) == 1
     assert example.text == "Hi there everyone It is just me"

     split_examples = example.split_sents()
     assert len(split_examples) == 2