Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)
Example class for training data (#4543)

* OrigAnnot class instead of gold.orig_annot list of zipped tuples
* from_orig to replace from_annot_tuples
* rename to RawAnnot
* some unit tests for GoldParse creation and internal format
* removing orig_annot and switching to lists instead of tuple
* rewriting tuples to use RawAnnot (+ debug statements, WIP)
* fix pop() changing the data
* small fixes
* pop-append fixes
* return RawAnnot for existing GoldParse to have uniform interface
* clean up imports
* fix merge_sents
* add unit test for 4402 with new structure (not working yet)
* introduce DocAnnot
* typo fixes
* add unit test for merge_sents
* rename from_orig to from_raw
* fixing unit tests
* fix nn parser
* read_annots to produce text, doc_annot pairs
* _make_golds fix
* rename golds_to_gold_annots
* small fixes
* fix encoding
* have golds_to_gold_annots use DocAnnot
* missed a spot
* merge_sents as function in DocAnnot
* allow specifying only part of the token-level annotations
* refactor with Example class + underlying dicts
* pipeline components to work with Example objects (wip)
* input checking
* fix yielding
* fix calls to update
* small fixes
* fix scorer unit test with new format
* fix kwargs order
* fixes for ud and conllu scripts
* fix reading data for conllu script
* add in proper errors (not fixed numbering yet to avoid merge conflicts)
* fixing few more small bugs
* fix EL script
This commit is contained in:
parent 56ad3a3988
commit e48a09df4e
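The thread running through these changes is that training code now passes a batch of Example objects (or (doc, gold) pairs) straight to the pipeline instead of unpacking separate doc and gold sequences. A rough sketch of the before/after pattern that repeats throughout the hunks below, assembled from lines in this diff (batch, optimizer and losses come from the surrounding training loops):

    # old style: unpack the batch into parallel text/annotation sequences
    texts, annotations = zip(*batch)
    nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)

    # new style: hand the batch of Example objects (or (doc, gold) tuples) to update() directly
    nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)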
@@ -13,23 +13,12 @@ import srsly
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
-from spacy.util import compounding, minibatch_by_words
-from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 
-# from spacy.morphology import Fused_begin, Fused_inside
-from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
-
 Fused_begin = None
 Fused_inside = None
 
-import itertools
-import random
-import numpy.random
 
 from . import conll17_ud_eval
 
 from spacy import lang
@@ -268,7 +257,7 @@ def load_nlp(experiments_dir, corpus):
     return nlp
 
 
-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
     nlp.add_pipe(nlp.create_pipe("parser"))
     return nlp
 
@@ -7,24 +7,20 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path
 import re
-import sys
 import json
 
 import spacy
 import spacy.util
 from bin.ud import conll17_ud_eval
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, Example
 from spacy.util import compounding, minibatch, minibatch_by_words
 from spacy.syntax.nonproj import projectivize
 from spacy.matcher import Matcher
 from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 
-import itertools
 import random
-import numpy.random
 
 from spacy import lang
 from spacy.lang import zh
@@ -56,7 +52,7 @@ def read_data(
     max_doc_length=None,
     limit=None,
 ):
-    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    """Read the CONLLU format into Example objects. If raw_text=True,
     include Doc objects created using nlp.make_doc and then aligned against
     the gold-standard sequences. If oracle_segments=True, include Doc objects
     created from the gold-standard segments. At least one must be True."""
@@ -101,15 +97,16 @@ def read_data(
             docs.append(doc)
             golds.append(gold)
             if limit and len(docs) >= limit:
-                return docs, golds
+                return golds_to_gold_data(docs, golds)
 
     if raw_text and sent_annots:
         doc, gold = _make_gold(nlp, None, sent_annots)
         docs.append(doc)
        golds.append(gold)
        if limit and len(docs) >= limit:
-            return docs, golds
-    return docs, golds
+            return golds_to_gold_data(docs, golds)
+    return golds_to_gold_data(docs, golds)
 
 
 def _parse_morph_string(morph_string):
     if morph_string == '_':
@@ -123,6 +120,7 @@ def _parse_morph_string(morph_string):
         output.append('%s_%s' % (key, value.lower()))
     return set(output)
 
+
 def read_conllu(file_):
     docs = []
     sent = []
@@ -183,16 +181,18 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
 #############################
 
 
-def golds_to_gold_tuples(docs, golds):
-    """Get out the annoying 'tuples' format used by begin_training, given the
+def golds_to_gold_data(docs, golds):
+    """Get out the training data format used by begin_training, given the
     GoldParse objects."""
-    tuples = []
+    data = []
     for doc, gold in zip(docs, golds):
-        text = doc.text
-        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
-        sents = [((ids, words, tags, heads, labels, iob), [])]
-        tuples.append((text, sents))
-    return tuples
+        example = Example(doc=doc)
+        example.add_doc_annotation(cats=gold.cats)
+        token_annotation_dict = gold.orig.to_dict()
+        example.add_token_annotation(**token_annotation_dict)
+        example.goldparse = gold
+        data.append(example)
+    return data
 
 
 ##############
@@ -348,7 +348,7 @@ def load_nlp(corpus, config, vectors=None):
     return nlp
 
 
-def initialize_pipeline(nlp, docs, golds, config, device):
+def initialize_pipeline(nlp, examples, config, device):
     nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
     nlp.add_pipe(nlp.create_pipe("morphologizer"))
     nlp.add_pipe(nlp.create_pipe("parser"))
@@ -356,14 +356,15 @@ def initialize_pipeline(nlp, docs, golds, config, device):
         nlp.parser.add_multitask_objective("tag")
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
-    for gold in golds:
+    for ex in examples:
+        gold = ex.gold
         for tag in gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     if torch is not None and device != -1:
         torch.set_default_tensor_type("torch.cuda.FloatTensor")
     optimizer = nlp.begin_training(
-        lambda: golds_to_gold_tuples(docs, golds),
+        lambda: examples,
         device=device,
         subword_features=config.subword_features,
         conv_depth=config.conv_depth,
@@ -504,20 +505,20 @@ def main(
     print("Train and evaluate", corpus, "using lang", paths.lang)
     nlp = load_nlp(paths.lang, config, vectors=vectors_dir)
 
-    docs, golds = read_data(
+    examples = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
 
-    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)
+    optimizer = initialize_pipeline(nlp, examples, config, gpu_device)
 
     batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
     beam_prob = compounding(0.2, 0.8, 1.001)
     for i in range(config.nr_epoch):
-        docs, golds = read_data(
+        examples = read_data(
             nlp,
             paths.train.conllu.open(encoding="utf8"),
             paths.train.text.open(encoding="utf8"),
@@ -526,22 +527,19 @@ def main(
             oracle_segments=use_oracle_segments,
             raw_text=not use_oracle_segments,
         )
-        Xs = list(zip(docs, golds))
-        random.shuffle(Xs)
+        random.shuffle(examples)
         if config.batch_by_words:
-            batches = minibatch_by_words(Xs, size=batch_sizes)
+            batches = minibatch_by_words(examples, size=batch_sizes)
         else:
-            batches = minibatch(Xs, size=batch_sizes)
+            batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(doc) for doc in docs)
+        n_train_words = sum(len(ex.doc) for ex in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                batch_docs, batch_gold = zip(*batch)
-                pbar.update(sum(len(doc) for doc in batch_docs))
+                pbar.update(sum(len(ex.doc) for ex in batch))
                 nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                 nlp.update(
-                    batch_docs,
-                    batch_gold,
+                    batch,
                     sgd=optimizer,
                     drop=config.dropout,
                     losses=losses,
@@ -46,7 +46,7 @@ def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_fre
             " cf. https://spacy.io/usage/models#languages."
         )
 
-    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
+    logger.info("Filtering entities with fewer than {} mentions or no description".format(min_entity_freq))
     entity_frequencies = io.read_entity_to_count(entity_freq_path)
     # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
     filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
@@ -131,10 +131,8 @@ def main(
        with nlp.disable_pipes(*other_pipes):
            for batch in batches:
                try:
-                    docs, golds = zip(*batch)
                    nlp.update(
-                        docs=docs,
-                        golds=golds,
+                        examples=batch,
                        sgd=optimizer,
                        drop=dropout,
                        losses=losses,
@@ -11,10 +11,9 @@ import json
 import spacy
 import spacy.util
 from spacy.tokens import Token, Doc
-from spacy.gold import GoldParse
+from spacy.gold import GoldParse, Example
 from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
 from spacy.matcher import Matcher
 
 import itertools
@@ -33,25 +32,25 @@ random.seed(0)
 numpy.random.seed(0)
 
 
-def minibatch_by_words(items, size=5000):
-    random.shuffle(items)
+def minibatch_by_words(examples, size=5000):
+    random.shuffle(examples)
     if isinstance(size, int):
         size_ = itertools.repeat(size)
     else:
         size_ = size
-    items = iter(items)
+    examples = iter(examples)
     while True:
         batch_size = next(size_)
         batch = []
         while batch_size >= 0:
             try:
-                doc, gold = next(items)
+                example = next(examples)
             except StopIteration:
                 if batch:
                     yield batch
                 return
-            batch_size -= len(doc)
-            batch.append((doc, gold))
+            batch_size -= len(example.doc)
+            batch.append(example)
         if batch:
             yield batch
         else:
@@ -78,7 +77,7 @@ def read_data(
     max_doc_length=None,
     limit=None,
 ):
-    """Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,
+    """Read the CONLLU format into Example objects. If raw_text=True,
     include Doc objects created using nlp.make_doc and then aligned against
     the gold-standard sequences. If oracle_segments=True, include Doc objects
     created from the gold-standard segments. At least one must be True."""
@@ -119,15 +118,15 @@ def read_data(
             docs.append(doc)
             golds.append(gold)
             if limit and len(docs) >= limit:
-                return docs, golds
+                return golds_to_gold_data(docs, golds)
 
     if raw_text and sent_annots:
         doc, gold = _make_gold(nlp, None, sent_annots)
         docs.append(doc)
         golds.append(gold)
         if limit and len(docs) >= limit:
-            return docs, golds
-    return docs, golds
+            return golds_to_gold_data(docs, golds)
+    return golds_to_gold_data(docs, golds)
 
 
 def read_conllu(file_):
@@ -181,16 +180,18 @@ def _make_gold(nlp, text, sent_annots):
 #############################
 
 
-def golds_to_gold_tuples(docs, golds):
-    """Get out the annoying 'tuples' format used by begin_training, given the
+def golds_to_gold_data(docs, golds):
+    """Get out the training data format used by begin_training, given the
     GoldParse objects."""
-    tuples = []
+    data = []
     for doc, gold in zip(docs, golds):
-        text = doc.text
-        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
-        sents = [((ids, words, tags, heads, labels, iob), [])]
-        tuples.append((text, sents))
-    return tuples
+        example = Example(doc=doc)
+        example.add_doc_annotation(cats=gold.cats)
+        token_annotation_dict = gold.orig.to_dict()
+        example.add_token_annotation(**token_annotation_dict)
+        example.goldparse = gold
+        data.append(example)
+    return data
 
 
 ##############
@@ -290,9 +291,9 @@ def get_token_conllu(token, i):
     return "\n".join(lines)
 
 
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
+Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
+Token.set_extension("begins_fused", default=False, force=True)
+Token.set_extension("inside_fused", default=False, force=True)
 
 
 ##################
@@ -308,7 +309,7 @@ def load_nlp(corpus, config):
     return nlp
 
 
-def initialize_pipeline(nlp, docs, golds, config):
+def initialize_pipeline(nlp, examples, config):
     nlp.add_pipe(nlp.create_pipe("parser"))
     if config.multitask_tag:
         nlp.parser.add_multitask_objective("tag")
@@ -316,18 +317,19 @@ def initialize_pipeline(nlp, docs, golds, config):
         nlp.parser.add_multitask_objective("sent_start")
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
-    for gold in golds:
-        for tag in gold.tags:
+    for ex in examples:
+        for tag in ex.gold.tags:
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
     actions = set(nlp.parser.labels)
     label_set = set([act.split("-")[1] for act in actions if "-" in act])
-    for gold in golds:
+    for ex in examples:
+        gold = ex.gold
         for i, label in enumerate(gold.labels):
             if label is not None and label not in label_set:
                 gold.labels[i] = label.split("||")[0]
-    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))
+    return nlp.begin_training(lambda: examples)
 
 
 ########################
@@ -401,28 +403,26 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
     print("Train and evaluate", corpus, "using lang", paths.lang)
     nlp = load_nlp(paths.lang, config)
 
-    docs, golds = read_data(
+    examples = read_data(
         nlp,
-        paths.train.conllu.open(),
-        paths.train.text.open(),
+        paths.train.conllu.open(encoding="utf8"),
+        paths.train.text.open(encoding="utf8"),
         max_doc_length=config.max_doc_length,
         limit=limit,
     )
 
-    optimizer = initialize_pipeline(nlp, docs, golds, config)
+    optimizer = initialize_pipeline(nlp, examples, config)
 
     for i in range(config.nr_epoch):
-        docs = [nlp.make_doc(doc.text) for doc in docs]
-        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
+        docs = [nlp.make_doc(example.doc.text) for example in examples]
+        batches = minibatch_by_words(examples, size=config.batch_size)
         losses = {}
         n_train_words = sum(len(doc) for doc in docs)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                batch_docs, batch_gold = zip(*batch)
-                pbar.update(sum(len(doc) for doc in batch_docs))
+                pbar.update(sum(len(ex.doc) for ex in batch))
                 nlp.update(
-                    batch_docs,
-                    batch_gold,
+                    examples=batch,
                     sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
@@ -31,14 +31,13 @@ random.seed(0)
 
 PWD = os.path.dirname(__file__)
 
-TRAIN_DATA = list(read_json_file(
-    os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
+TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
 
 
-def get_position_label(i, words, tags, heads, labels, ents):
+def get_position_label(i, token_annotation):
     """Return labels indicating the position of the word in the document.
     """
-    if len(words) < 20:
+    if len(token_annotation.words) < 20:
         return "short-doc"
     elif i == 0:
         return "first-word"
@@ -46,7 +45,7 @@ def get_position_label(i, words, tags, heads, labels, ents):
         return "early-word"
     elif i < 20:
         return "mid-word"
-    elif i == len(words) - 1:
+    elif i == len(token_annotation.words) - 1:
         return "last-word"
     else:
         return "late-word"
@@ -60,17 +59,17 @@ def main(n_iter=10):
     print(nlp.pipeline)
 
     print("Create data", len(TRAIN_DATA))
-    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
+    optimizer = nlp.begin_training(get_examples=lambda: TRAIN_DATA)
     for itn in range(n_iter):
         random.shuffle(TRAIN_DATA)
         losses = {}
-        for text, annot_brackets in TRAIN_DATA:
-            for annotations, _ in annot_brackets:
-                doc = Doc(nlp.vocab, words=annotations[1])
-                gold = GoldParse.from_annot_tuples(doc, annotations)
+        for example in TRAIN_DATA:
+            for token_annotation in example.token_annotations:
+                doc = Doc(nlp.vocab, words=token_annotation.words)
+                gold = GoldParse.from_annotation(doc, example.doc_annotation, token_annotation)
 
                 nlp.update(
-                    [doc],  # batch of texts
-                    [gold],  # batch of annotations
+                    examples=[(doc, gold)],  # 1 example
                     drop=0.2,  # dropout - make it harder to memorise data
                     sgd=optimizer,  # callable to update weights
                     losses=losses,
@@ -78,9 +77,9 @@ def main(n_iter=10):
         print(losses.get("nn_labeller", 0.0), losses["ner"])
 
     # test the trained model
-    for text, _ in TRAIN_DATA:
-        if text is not None:
-            doc = nlp(text)
+    for example in TRAIN_DATA:
+        if example.text is not None:
+            doc = nlp(example.text)
             print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
             print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 
@@ -116,7 +116,7 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
         losses = {}
         for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
             docs = [nlp.make_doc(text) for text in batch]
-            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
+            tensorizer.update((docs, None), losses=losses, sgd=optimizer, drop=dropout)
         print(losses)
     return optimizer
 
@@ -147,8 +147,7 @@ def train_textcat(nlp, n_texts, n_iter=10):
             # batch up the examples using spaCy's minibatch
             batches = minibatch(tqdm.tqdm(train_data), size=2)
             for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
@@ -74,8 +74,7 @@ def main(model_name, unlabelled_loc):
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_docs, size=4)
            for batch in minibatch(TRAIN_DATA, size=sizes):
-                docs, golds = zip(*batch)
-                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
@@ -108,10 +108,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
                nlp.update(
-                    texts,  # batch of texts
-                    annotations,  # batch of annotations
+                    batch,
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
@@ -133,8 +133,7 @@ def main(model=None, output_dir=None, n_iter=15):
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
            print("Losses", losses)
 
    # test the trained model
@@ -67,10 +67,8 @@ def main(model=None, output_dir=None, n_iter=100):
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
                nlp.update(
-                    texts,  # batch of texts
-                    annotations,  # batch of annotations
+                    batch,
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
@@ -104,8 +104,7 @@ def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)
 
    # test the trained model
@@ -74,8 +74,7 @@ def main(model=None, output_dir=None, n_iter=15):
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
            print("Losses", losses)
 
    # test the trained model
@@ -65,8 +65,7 @@ def main(lang="en", output_dir=None, n_iter=25):
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+                nlp.update(batch, sgd=optimizer, losses=losses)
            print("Losses", losses)
 
    # test the trained model
@@ -82,8 +82,7 @@ def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
-                texts, annotations = zip(*batch)
-                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
+                nlp.update(batch, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 import re
 
+from spacy.gold import Example
 from ...gold import iob_to_biluo
 
 
@@ -19,15 +20,15 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
     # by @katarkor
     docs = []
     sentences = []
-    conll_tuples = read_conllx(input_data, use_morphology=use_morphology)
+    conll_data = read_conllx(input_data, use_morphology=use_morphology)
     checked_for_ner = False
     has_ner_tags = False
-    for i, (raw_text, tokens) in enumerate(conll_tuples):
-        sentence, brackets = tokens[0]
+    for i, example in enumerate(conll_data):
+        for token_annotation in example.token_annotations:
             if not checked_for_ner:
-                has_ner_tags = is_ner(sentence[5][0])
+                has_ner_tags = is_ner(token_annotation.entities[0])
                 checked_for_ner = True
-        sentences.append(generate_sentence(sentence, has_ner_tags))
+            sentences.append(generate_sentence(token_annotation, has_ner_tags))
         # Real-sized documents could be extracted using the comments on the
         # conluu document
         if len(sentences) % n_sents == 0:
@@ -52,15 +53,15 @@ def is_ner(tag):
 
 
 def read_conllx(input_data, use_morphology=False, n=0):
+    """ Yield example data points, one for each sentence """
     i = 0
     for sent in input_data.strip().split("\n\n"):
         lines = sent.strip().split("\n")
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            tokens = []
+            ids, words, tags, heads, deps, ents = [], [], [], [], [], []
            for line in lines:
 
                parts = line.split("\t")
                id_, word, lemma, pos, tag, morph, head, dep, _1, iob = parts
                if "-" in id_ or "." in id_:
@@ -72,14 +73,22 @@ def read_conllx(input_data, use_morphology=False, n=0):
                    tag = pos if tag == "_" else tag
                    tag = tag + "__" + morph if use_morphology else tag
                    iob = iob if iob else "O"
-                    tokens.append((id_, word, tag, head, dep, iob))
+                    ids.append(id_)
+                    words.append(word)
+                    tags.append(tag)
+                    heads.append(head)
+                    deps.append(dep)
+                    ents.append(iob)
                except:  # noqa: E722
                    print(line)
                    raise
-            tuples = [list(t) for t in zip(*tokens)]
-            yield (None, [[tuples, []]])
+            example = Example(doc=None)
+            example.add_token_annotation(ids=ids, words=words, tags=tags,
+                                         heads=heads, deps=deps, entities=ents)
+            yield example
            i += 1
-            if n >= 1 and i >= n:
+            if 1 <= n <= i:
                break
 
 
@@ -107,20 +116,19 @@ def simplify_tags(iob):
     return new_iob
 
 
-def generate_sentence(sent, has_ner_tags):
-    (id_, word, tag, head, dep, iob) = sent
+def generate_sentence(token_annotation, has_ner_tags):
     sentence = {}
     tokens = []
     if has_ner_tags:
-        iob = simplify_tags(iob)
+        iob = simplify_tags(token_annotation.entities)
         biluo = iob_to_biluo(iob)
-    for i, id in enumerate(id_):
+    for i, id in enumerate(token_annotation.ids):
         token = {}
         token["id"] = id
-        token["orth"] = word[i]
-        token["tag"] = tag[i]
-        token["head"] = head[i] - id
-        token["dep"] = dep[i]
+        token["orth"] = token_annotation.words[i]
+        token["tag"] = token_annotation.tags[i]
+        token["head"] = token_annotation.heads[i] - id
+        token["dep"] = token_annotation.deps[i]
         if has_ner_tags:
            token["ner"] = biluo[i]
        tokens.append(token)
@@ -80,16 +80,16 @@ def debug_data(
     with msg.loading("Loading corpus..."):
         corpus = GoldCorpus(train_path, dev_path)
         try:
-            train_docs = list(corpus.train_docs(nlp))
-            train_docs_unpreprocessed = list(
-                corpus.train_docs_without_preprocessing(nlp)
+            train_dataset = list(corpus.train_dataset(nlp))
+            train_dataset_unpreprocessed = list(
+                corpus.train_dataset_without_preprocessing(nlp)
             )
         except ValueError as e:
             loading_train_error_message = "Training data cannot be loaded: {}".format(
                 str(e)
             )
         try:
-            dev_docs = list(corpus.dev_docs(nlp))
+            dev_dataset = list(corpus.dev_dataset(nlp))
         except ValueError as e:
             loading_dev_error_message = "Development data cannot be loaded: {}".format(
                 str(e)
@@ -102,10 +102,10 @@ def debug_data(
         sys.exit(1)
     msg.good("Corpus is loadable")
 
-    # Create all gold data here to avoid iterating over the train_docs constantly
-    gold_train_data = _compile_gold(train_docs, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline)
-    gold_dev_data = _compile_gold(dev_docs, pipeline)
+    # Create all gold data here to avoid iterating over the train_dataset constantly
+    gold_train_data = _compile_gold(train_dataset, pipeline)
+    gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline)
+    gold_dev_data = _compile_gold(dev_dataset, pipeline)
 
     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -118,19 +118,19 @@ def debug_data(
         msg.text("Starting with base model '{}'".format(base_model))
     else:
         msg.text("Starting with blank model '{}'".format(lang))
-    msg.text("{} training docs".format(len(train_docs)))
-    msg.text("{} evaluation docs".format(len(dev_docs)))
+    msg.text("{} training docs".format(len(train_dataset)))
+    msg.text("{} evaluation docs".format(len(gold_dev_data)))
 
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
         msg.warn("{} training examples also in evaluation data".format(overlap))
     else:
         msg.good("No overlap between training and evaluation data")
-    if not base_model and len(train_docs) < BLANK_MODEL_THRESHOLD:
+    if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
         text = "Low number of examples to train from a blank model ({})".format(
-            len(train_docs)
+            len(train_dataset)
         )
-        if len(train_docs) < BLANK_MODEL_MIN_THRESHOLD:
+        if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
             msg.fail(text)
         else:
             msg.warn(text)
@@ -238,7 +238,7 @@ def debug_data(
                has_low_data_warning = True
 
            with msg.loading("Analyzing label distribution..."):
-                neg_docs = _get_examples_without_label(train_docs, label)
+                neg_docs = _get_examples_without_label(train_dataset, label)
            if neg_docs == 0:
                msg.warn(
                    "No examples for texts WITHOUT new label '{}'".format(label)
@@ -358,7 +358,7 @@ def debug_data(
        msg.info(
            "Found {} sentence{} with an average length of {:.1f} words.".format(
                gold_train_data["n_sents"],
-                "s" if len(train_docs) > 1 else "",
+                "s" if len(train_dataset) > 1 else "",
                gold_train_data["n_words"] / gold_train_data["n_sents"],
            )
        )
@@ -536,7 +536,7 @@ def _load_file(file_path, msg):
     )
 
 
-def _compile_gold(train_docs, pipeline):
+def _compile_gold(examples, pipeline):
     data = {
         "ner": Counter(),
         "cats": Counter(),
@@ -553,7 +553,9 @@ def _compile_gold(train_docs, pipeline):
         "n_cats_multilabel": 0,
         "texts": set(),
     }
-    for doc, gold in train_docs:
+    for example in examples:
+        gold = example.gold
+        doc = example.doc
         valid_words = [x for x in gold.words if x is not None]
         data["words"].update(valid_words)
         data["n_words"] += len(valid_words)
@@ -598,8 +600,8 @@ def _format_labels(labels, counts=False):
 
 def _get_examples_without_label(data, label):
     count = 0
-    for doc, gold in data:
-        labels = [label.split("-")[1] for label in gold.ner if label not in ("O", "-")]
+    for ex in data:
+        labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")]
         if label not in labels:
            count += 1
     return count
@@ -45,11 +45,11 @@ def evaluate(
         msg.fail("Visualization output directory not found", displacy_path, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
-    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
+    dev_dataset = list(corpus.dev_dataset(nlp, gold_preproc=gold_preproc))
     begin = timer()
-    scorer = nlp.evaluate(dev_docs, verbose=False)
+    scorer = nlp.evaluate(dev_dataset, verbose=False)
     end = timer()
-    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+    nwords = sum(len(ex.doc) for ex in dev_dataset)
     results = {
         "Time": "%.2f s" % (end - begin),
         "Words": nwords,
@@ -66,7 +66,7 @@ def evaluate(
     msg.table(results, title="Results")
 
     if displacy_path:
-        docs, golds = zip(*dev_docs)
+        docs = [ex.doc for ex in dev_dataset]
         render_deps = "parser" in nlp.meta.get("pipeline", [])
         render_ents = "ner" in nlp.meta.get("pipeline", [])
         render_parses(
@@ -14,6 +14,7 @@ from thinc.neural.util import prefer_gpu
 from wasabi import Printer
 import srsly
 
+from spacy.gold import Example
 from ..errors import Errors
 from ..tokens import Doc
 from ..attrs import ID, HEAD
@@ -221,7 +222,7 @@ def pretrain(
     skip_counter = 0
     for epoch in range(epoch_start, n_iter + epoch_start):
         for batch_id, batch in enumerate(
-            util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
+            util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size)
         ):
             docs, count = make_docs(
                 nlp,
@@ -236,7 +236,7 @@ def train(
         optimizer = create_default_optimizer(Model.ops)
     else:
         # Start with a blank model, call begin_training
-        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
+        optimizer = nlp.begin_training(lambda: corpus.train_examples, device=use_gpu)
 
     nlp._optimizer = None
 
@@ -261,7 +261,7 @@ def train(
                "problem with two labels.".format(textcat_positive_label),
                exits=1,
            )
-        train_docs = corpus.train_docs(
+        train_data = corpus.train_data(
            nlp,
            noise_level=noise_level,
            gold_preproc=gold_preproc,
@@ -271,9 +271,9 @@ def train(
        train_labels = set()
        if textcat_multilabel:
            multilabel_found = False
-            for text, gold in train_docs:
-                train_labels.update(gold.cats.keys())
-                if list(gold.cats.values()).count(1.0) != 1:
+            for ex in train_data:
+                train_labels.update(ex.gold.cats.keys())
+                if list(ex.gold.cats.values()).count(1.0) != 1:
                    multilabel_found = True
            if not multilabel_found and not base_model:
                msg.warn(
@@ -283,9 +283,9 @@ def train(
                    "mutually-exclusive classes."
                )
        if not textcat_multilabel:
-            for text, gold in train_docs:
-                train_labels.update(gold.cats.keys())
-                if list(gold.cats.values()).count(1.0) != 1 and not base_model:
+            for ex in train_data:
+                train_labels.update(ex.gold.cats.keys())
+                if list(ex.gold.cats.values()).count(1.0) != 1 and not base_model:
                    msg.warn(
                        "Some textcat training instances do not have exactly "
                        "one positive label. Modifying training options to "
@@ -341,7 +341,7 @@ def train(
     iter_since_best = 0
     best_score = 0.0
     for i in range(n_iter):
-        train_docs = corpus.train_docs(
+        train_data = corpus.train_data(
            nlp,
            noise_level=noise_level,
            orth_variant_level=orth_variant_level,
@@ -357,13 +357,11 @@ def train(
        words_seen = 0
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            losses = {}
-            for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
+            for batch in util.minibatch_by_words(train_data, size=batch_sizes):
                if not batch:
                    continue
-                docs, golds = zip(*batch)
                nlp.update(
-                    docs,
-                    golds,
+                    batch,
                    sgd=optimizer,
                    drop=next(dropout_rates),
                    losses=losses,
@@ -373,6 +371,7 @@ def train(
                    # which use unlabelled data to reduce overfitting.
                    raw_batch = list(next(raw_batches))
                    nlp.rehearse(raw_batch, sgd=optimizer, losses=losses)
+                docs = [ex.doc for ex in batch]
                if not int(os.environ.get("LOG_FRIENDLY", 0)):
                    pbar.update(sum(len(doc) for doc in docs))
                words_seen += sum(len(doc) for doc in docs)
@@ -385,16 +384,16 @@ def train(
            for name, component in nlp_loaded.pipeline:
                if hasattr(component, "cfg"):
                    component.cfg["beam_width"] = beam_width
-            dev_docs = list(
-                corpus.dev_docs(
+            dev_dataset = list(
+                corpus.dev_dataset(
                    nlp_loaded,
                    gold_preproc=gold_preproc,
                    ignore_misaligned=True,
                )
            )
-            nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
+            nwords = sum(len(ex.doc) for ex in dev_dataset)
            start_time = timer()
-            scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+            scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
            end_time = timer()
            if use_gpu < 0:
                gpu_wps = None
@@ -406,15 +405,15 @@ def train(
                for name, component in nlp_loaded.pipeline:
                    if hasattr(component, "cfg"):
                        component.cfg["beam_width"] = beam_width
-                dev_docs = list(
-                    corpus.dev_docs(
+                dev_dataset = list(
+                    corpus.dev_dataset(
                        nlp_loaded,
                        gold_preproc=gold_preproc,
                        ignore_misaligned=True,
                    )
                )
                start_time = timer()
-                scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
+                scorer = nlp_loaded.evaluate(dev_dataset, verbose=verbose)
                end_time = timer()
                cpu_wps = nwords / (end_time - start_time)
            acc_loc = output_path / ("model%d" % i) / "accuracy.json"
@@ -530,6 +530,12 @@ class Errors(object):
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
 
+    # TODO: fix numbering after merging develop into master
+    E998 = ("Can only create GoldParse's from Example's without a Doc, "
+            "if get_gold_parses() is called with a Vocab object.")
+    E999 = ("Encountered an unexpected format for the dictionary holding "
+            "gold annotations: {gold_dict}")
+
 
 @add_codes
 class TempErrors(object):
@@ -1,6 +1,6 @@
 from cymem.cymem cimport Pool
 
-from .structs cimport TokenC
+from spacy.tokens import Doc
 from .typedefs cimport attr_t
 from .syntax.transition_system cimport Transition
 
@@ -19,6 +19,7 @@ cdef class GoldParse:
     cdef Pool mem
 
     cdef GoldParseC c
+    cdef readonly TokenAnnotation orig
 
     cdef int length
     cdef public int loss
@@ -29,13 +30,36 @@ cdef class GoldParse:
     cdef public list labels
     cdef public dict orths
     cdef public list ner
-    cdef public list ents
     cdef public dict brackets
-    cdef public object cats
+    cdef public dict cats
     cdef public dict links
 
     cdef readonly list cand_to_gold
     cdef readonly list gold_to_cand
-    cdef readonly list orig_annot
+
+
+cdef class TokenAnnotation:
+    cdef public list ids
+    cdef public list words
+    cdef public list tags
+    cdef public list heads
+    cdef public list deps
+    cdef public list entities
+    cdef public list morphology
+    cdef public list brackets
+
+
+cdef class DocAnnotation:
+    cdef public object cats
+    cdef public object links
+
+
+cdef class Example:
+    cdef public object doc
+    cdef public list token_annotations
+    cdef public DocAnnotation doc_annotation
+    cdef public object make_projective
+    cdef public object ignore_misaligned
+    cdef public object goldparse
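The Example container declared above bundles a Doc with a DocAnnotation and a list of TokenAnnotations. Based on the golds_to_gold_data() helpers earlier in this diff, building one from an existing Doc/GoldParse pair looks roughly like this (a sketch only; doc and gold stand for an aligned pair from the old format):

    example = Example(doc=doc)
    example.add_doc_annotation(cats=gold.cats)
    token_annotation_dict = gold.orig.to_dict()
    example.add_token_annotation(**token_annotation_dict)
    example.goldparse = gold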
519  spacy/gold.pyx
@@ -14,11 +14,8 @@ import srsly
 from .syntax import nonproj
 from .tokens import Doc, Span
 from .errors import Errors, AlignmentError
-from .compat import path2str
+from .compat import path2str, basestring_
 from . import util
-from .util import minibatch, itershuffle
 
-from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
-
 
 USE_NEW_ALIGN = False
@@ -54,25 +51,6 @@ def tags_to_entities(tags):
     return entities


-def merge_sents(sents):
-    m_deps = [[], [], [], [], [], []]
-    m_cats = {}
-    m_brackets = []
-    i = 0
-    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
-        m_deps[0].extend(id_ + i for id_ in ids)
-        m_deps[1].extend(words)
-        m_deps[2].extend(tags)
-        m_deps[3].extend(head + i for head in heads)
-        m_deps[4].extend(labels)
-        m_deps[5].extend(ner)
-        m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
-                          for b in brackets)
-        m_cats.update(cats)
-        i += len(ids)
-    return [(m_deps, (m_cats, m_brackets))]
-
-
 _ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
@@ -211,14 +189,14 @@ class GoldCorpus(object):
     def __init__(self, train, dev, gold_preproc=False, limit=None):
         """Create a GoldCorpus.

-        train_path (unicode or Path): File or directory of training data.
-        dev_path (unicode or Path): File or directory of development data.
+        train (unicode or Path): File or directory of training data.
+        dev (unicode or Path): File or directory of development data.
         RETURNS (GoldCorpus): The newly created object.
         """
         self.limit = limit
         if isinstance(train, str) or isinstance(train, Path):
-            train = self.read_tuples(self.walk_corpus(train))
-        dev = self.read_tuples(self.walk_corpus(dev))
+            train = self.read_examples(self.walk_corpus(train))
+        dev = self.read_examples(self.walk_corpus(dev))
         # Write temp directory with one doc per file, so we can shuffle and stream
         self.tmp_dir = Path(tempfile.mkdtemp())
         self.write_msgpack(self.tmp_dir / "train", train, limit=self.limit)
@@ -228,13 +206,15 @@ class GoldCorpus(object):
         shutil.rmtree(path2str(self.tmp_dir))

     @staticmethod
-    def write_msgpack(directory, doc_tuples, limit=0):
+    def write_msgpack(directory, examples, limit=0):
         if not directory.exists():
             directory.mkdir()
         n = 0
-        for i, doc_tuple in enumerate(doc_tuples):
-            srsly.write_msgpack(directory / "{}.msg".format(i), [doc_tuple])
-            n += len(doc_tuple[1])
+        for i, example in enumerate(examples):
+            ex_dict = example.to_dict()
+            text = example.text
+            srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict))
+            n += len(example.token_annotations)
             if limit and n >= limit:
                 break
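For orientation only (a sketch, not part of the diff): each msgpack file now stores a (text, annotation-dict) pair, which round-trips through Example.to_dict() and Example.from_dict(). The file path below is a placeholder, and the example object is the one built in the earlier sketch.

import srsly
from spacy.gold import Example

# serialize: the Doc itself is not exported, only its text and the annotations
ex_dict = example.to_dict()
srsly.write_msgpack("/tmp/0.msg", (example.text, ex_dict))

# deserialize: rebuild an Example from the stored pair
text, ex_dict = srsly.read_msgpack("/tmp/0.msg")
restored = Example.from_dict(ex_dict, doc=text)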
@@ -259,128 +239,144 @@ class GoldCorpus(object):
         return locs

     @staticmethod
-    def read_tuples(locs, limit=0):
+    def read_examples(locs, limit=0):
+        """ Yield training examples """
         i = 0
         for loc in locs:
             loc = util.ensure_path(loc)
             if loc.parts[-1].endswith("json"):
-                gold_tuples = read_json_file(loc)
+                examples = read_json_file(loc)
             elif loc.parts[-1].endswith("jsonl"):
                 gold_tuples = srsly.read_jsonl(loc)
                 first_gold_tuple = next(gold_tuples)
                 gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
                 # TODO: proper format checks with schemas
                 if isinstance(first_gold_tuple, dict):
-                    gold_tuples = read_json_object(gold_tuples)
+                    if first_gold_tuple.get("paragraphs", None):
+                        examples = read_json_object(gold_tuples)
+                    elif first_gold_tuple.get("doc_annotation", None):
+                        examples = []
+                        for ex_dict in gold_tuples:
+                            doc = ex_dict.get("doc", None)
+                            if doc is None:
+                                doc = ex_dict.get("text", None)
+                            examples.append(Example.from_dict(ex_dict, doc=doc))
             elif loc.parts[-1].endswith("msg"):
-                gold_tuples = srsly.read_msgpack(loc)
+                text, ex_dict = srsly.read_msgpack(loc)
+                examples = [Example.from_dict(ex_dict, doc=text)]
             else:
                 supported = ("json", "jsonl", "msg")
                 raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
-            for item in gold_tuples:
-                yield item
-                i += len(item[1])
+            for example in examples:
+                yield example
+                i += len(example.token_annotations)
                 if limit and i >= limit:
                     return
     @property
-    def dev_tuples(self):
+    def dev_examples(self):
         locs = (self.tmp_dir / "dev").iterdir()
-        yield from self.read_tuples(locs, limit=self.limit)
+        yield from self.read_examples(locs, limit=self.limit)

     @property
-    def train_tuples(self):
+    def train_examples(self):
         locs = (self.tmp_dir / "train").iterdir()
-        yield from self.read_tuples(locs, limit=self.limit)
+        yield from self.read_examples(locs, limit=self.limit)

     def count_train(self):
+        # TODO: should this count words or sentences ?
         n = 0
         i = 0
-        for raw_text, paragraph_tuples in self.train_tuples:
-            for sent_tuples, brackets in paragraph_tuples:
-                n += len(sent_tuples[1])
+        for example in self.train_examples:
+            for token_annotation in example.token_annotations:
+                n += len(token_annotation.words)
                 if self.limit and i >= self.limit:
                     break
                 i += 1
         return n
-    def train_docs(self, nlp, gold_preproc=False, max_length=None,
+    def train_dataset(self, nlp, gold_preproc=False, max_length=None,
                    noise_level=0.0, orth_variant_level=0.0,
                    ignore_misaligned=False):
         locs = list((self.tmp_dir / 'train').iterdir())
         random.shuffle(locs)
-        train_tuples = self.read_tuples(locs, limit=self.limit)
-        gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
+        train_examples = self.read_examples(locs, limit=self.limit)
+        gold_examples = self.iter_gold_docs(nlp, train_examples, gold_preproc,
                                         max_length=max_length,
                                         noise_level=noise_level,
                                         orth_variant_level=orth_variant_level,
                                         make_projective=True,
                                         ignore_misaligned=ignore_misaligned)
-        yield from gold_docs
+        yield from gold_examples

-    def train_docs_without_preprocessing(self, nlp, gold_preproc=False):
-        gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc)
-        yield from gold_docs
+    def train_dataset_without_preprocessing(self, nlp, gold_preproc=False):
+        examples = self.iter_gold_docs(nlp, self.train_examples, gold_preproc=gold_preproc)
+        yield from examples

-    def dev_docs(self, nlp, gold_preproc=False, ignore_misaligned=False):
-        gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc,
+    def dev_dataset(self, nlp, gold_preproc=False, ignore_misaligned=False):
+        examples = self.iter_gold_docs(nlp, self.dev_examples, gold_preproc=gold_preproc,
                                        ignore_misaligned=ignore_misaligned)
-        yield from gold_docs
+        yield from examples
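A hedged usage sketch of the renamed corpus helpers (the paths below are placeholders): train_docs/dev_docs become train_dataset/dev_dataset, and they now yield Example objects rather than (Doc, GoldParse) tuples.

import spacy
from spacy.gold import GoldCorpus

corpus = GoldCorpus("train.json", "dev.json", limit=1000)  # placeholder paths
nlp = spacy.blank("en")
for example in corpus.train_dataset(nlp, orth_variant_level=0.1,
                                    ignore_misaligned=True):
    # each item is now an Example, not a (Doc, GoldParse) tuple
    doc, gold = example.doc, example.gold
    break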
     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
+    def iter_gold_docs(cls, nlp, examples, gold_preproc, max_length=None,
                        noise_level=0.0, orth_variant_level=0.0, make_projective=False,
                        ignore_misaligned=False):
-        for raw_text, paragraph_tuples in tuples:
+        """ Setting gold_preproc will result in creating a doc per 'sentence' """
+        for example in examples:
             if gold_preproc:
-                raw_text = None
+                example.doc = None
             else:
-                paragraph_tuples = merge_sents(paragraph_tuples)
-            docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
-                    paragraph_tuples, gold_preproc, noise_level=noise_level,
+                example = example.merge_sents()
+            example.make_projective = make_projective
+            example.ignore_misaligned = ignore_misaligned
+            examples = cls._make_docs(nlp, example,
+                    gold_preproc, noise_level=noise_level,
                     orth_variant_level=orth_variant_level)
-            golds = cls._make_golds(docs, paragraph_tuples, make_projective,
-                                    ignore_misaligned=ignore_misaligned)
-            for doc, gold in zip(docs, golds):
-                if gold is not None:
-                    if (not max_length) or len(doc) < max_length:
-                        yield doc, gold
+            examples = cls._make_golds(examples, vocab=nlp.vocab)
+            for ex in examples:
+                if ex.gold is not None:
+                    if (not max_length) or len(ex.doc) < max_length:
+                        yield ex
     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
-        if raw_text is not None:
-            raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
-            raw_text = add_noise(raw_text, noise_level)
-            return [nlp.make_doc(raw_text)], paragraph_tuples
+    def _make_docs(cls, nlp, example, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
+        # gold_preproc is not used ?!
+        if example.text is not None:
+            var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
+            var_text = add_noise(var_example.text, noise_level)
+            var_doc = nlp.make_doc(var_text)
+            var_example.doc = var_doc
+            return [var_example]
         else:
-            docs = []
-            raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
-            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
-                    for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
+            var_example = make_orth_variants(nlp, example, orth_variant_level=orth_variant_level)
+            doc_examples = []
+            for token_annotation in var_example.token_annotations:
+                t_doc = Doc(nlp.vocab, words=add_noise(token_annotation.words, noise_level))
+                doc_example = Example(doc_annotation=example.doc_annotation,
                                      token_annotations=[token_annotation],
                                      doc=t_doc)
+                doc_examples.append(doc_example)
+            return doc_examples
     @classmethod
-    def _make_golds(cls, docs, paragraph_tuples, make_projective, ignore_misaligned=False):
-        if len(docs) != len(paragraph_tuples):
-            n_annots = len(paragraph_tuples)
-            raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
-        golds = []
-        for doc, (sent_tuples, (cats, brackets)) in zip(docs, paragraph_tuples):
-            try:
-                gold = GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
-                                                   make_projective=make_projective)
-            except AlignmentError:
-                if ignore_misaligned:
-                    gold = None
-                else:
-                    raise
-            golds.append(gold)
-        return golds
+    def _make_golds(cls, examples, vocab=None):
+        gold_examples = []
+        for example in examples:
+            gold_parses = example.get_gold_parses(vocab=vocab)
+            for (doc, gold) in gold_parses:
+                ex = Example(doc=doc)
+                ex.goldparse = gold
+                gold_examples.append(ex)
+        return gold_examples
-def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
+def make_orth_variants(nlp, example, orth_variant_level=0.0):
     if random.random() >= orth_variant_level:
-        return raw, paragraph_tuples
+        return example
+    if not example.token_annotations:
+        return example
+    raw = example.text
     if random.random() >= 0.5:
         lower = True
         if raw is not None:

@@ -388,9 +384,15 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
     ndsv = nlp.Defaults.single_orth_variants
     ndpv = nlp.Defaults.paired_orth_variants
     # modify words in paragraph_tuples
-    variant_paragraph_tuples = []
-    for sent_tuples, brackets in paragraph_tuples:
-        ids, words, tags, heads, labels, ner = sent_tuples
+    variant_example = Example(doc=raw)
+    for token_annotation in example.token_annotations:
+        words = token_annotation.words
+        tags = token_annotation.tags
+        if not words or not tags:
+            # add the unmodified annotation
+            token_dict = token_annotation.to_dict()
+            variant_example.add_token_annotation(**token_dict)
+        else:
             if lower:
                 words = [w.lower() for w in words]
             # single variants

@@ -419,7 +421,10 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
                         pair_idx = pair.index(words[word_idx])
                     words[word_idx] = punct_choices[punct_idx][pair_idx]
-        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner), brackets))
+            token_dict = token_annotation.to_dict()
+            token_dict["words"] = words
+            token_dict["tags"] = tags
+            variant_example.add_token_annotation(**token_dict)
     # modify raw to match variant_paragraph_tuples
     if raw is not None:
         variants = []

@@ -437,9 +442,8 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
         while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
             variant_raw += raw[raw_idx]
             raw_idx += 1
-        for sent_tuples, brackets in variant_paragraph_tuples:
-            ids, words, tags, heads, labels, ner = sent_tuples
-            for word in words:
+        for token_annotation in variant_example.token_annotations:
+            for word in token_annotation.words:
                 match_found = False
                 # add identical word
                 if word not in variants and raw[raw_idx:].startswith(word):

@@ -457,13 +461,14 @@ def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
                 # something went wrong, abort
                 # (add a warning message?)
                 if not match_found:
-                    return raw, paragraph_tuples
+                    return example
                 # add following whitespace
                 while raw_idx < len(raw) and re.match("\s", raw[raw_idx]):
                     variant_raw += raw[raw_idx]
                     raw_idx += 1
-        return variant_raw, variant_paragraph_tuples
-    return raw, variant_paragraph_tuples
+        variant_example.doc = variant_raw
+        return variant_example
+    return variant_example
 def add_noise(orig, noise_level):

@@ -488,30 +493,27 @@ def _corrupt(c, noise_level):
 def read_json_object(json_corpus_section):
     """Take a list of JSON-formatted documents (e.g. from an already loaded
-    training data file) and yield tuples in the GoldParse format.
+    training data file) and yield annotations in the GoldParse format.

     json_corpus_section (list): The data.
-    YIELDS (tuple): The reformatted data.
+    YIELDS (Example): The reformatted data - one training example per paragraph
     """
     for json_doc in json_corpus_section:
-        tuple_doc = json_to_tuple(json_doc)
-        for tuple_paragraph in tuple_doc:
-            yield tuple_paragraph
+        examples = json_to_examples(json_doc)
+        for ex in examples:
+            yield ex
-def json_to_tuple(doc):
-    """Convert an item in the JSON-formatted training data to the tuple format
+def json_to_examples(doc):
+    """Convert an item in the JSON-formatted training data to the format
     used by GoldParse.

     doc (dict): One entry in the training data.
-    YIELDS (tuple): The reformatted data.
+    YIELDS (Example): The reformatted data - one training example per paragraph
     """
     paragraphs = []
     for paragraph in doc["paragraphs"]:
-        sents = []
-        cats = {}
-        for cat in paragraph.get("cats", {}):
-            cats[cat["label"]] = cat["value"]
+        example = Example(doc=paragraph.get("raw", None))
         for sent in paragraph["sentences"]:
             words = []
             ids = []

@@ -529,11 +531,14 @@ def json_to_tuple(doc):
                 if labels[-1].lower() == "root":
                     labels[-1] = "ROOT"
                 ner.append(token.get("ner", "-"))
-            sents.append([
-                [ids, words, tags, heads, labels, ner],
-                [cats, sent.get("brackets", [])]])
-        if sents:
-            yield [paragraph.get("raw", None), sents]
+            example.add_token_annotation(ids=ids, words=words, tags=tags,
+                    heads=heads, deps=labels, entities=ner,
+                    brackets=sent.get("brackets", []))
+        cats = {}
+        for cat in paragraph.get("cats", {}):
+            cats[cat["label"]] = cat["value"]
+        example.add_doc_annotation(cats=cats)
+        yield example
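For reference, a sketch of the JSON training structure that json_to_examples() consumes; the field values here are invented. Each paragraph becomes one Example, each sentence one TokenAnnotation, and the paragraph-level cats go into the DocAnnotation.

# Hypothetical input document in spaCy's JSON training format
from spacy.gold import json_to_examples  # assumed importable like the other gold helpers

json_doc = {
    "id": 0,
    "paragraphs": [
        {
            "raw": "I like pizza",
            "sentences": [
                {
                    "tokens": [
                        {"id": 0, "orth": "I", "tag": "PRP", "head": 1, "dep": "nsubj", "ner": "O"},
                        {"id": 1, "orth": "like", "tag": "VBP", "head": 0, "dep": "ROOT", "ner": "O"},
                        {"id": 2, "orth": "pizza", "tag": "NN", "head": -1, "dep": "dobj", "ner": "O"},
                    ],
                    "brackets": [],
                }
            ],
            "cats": [{"label": "FOOD", "value": 1.0}],
        }
    ],
}
examples = list(json_to_examples(json_doc))  # one Example per paragraph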
 def read_json_file(loc, docs_filter=None, limit=None):

@@ -545,8 +550,8 @@ def read_json_file(loc, docs_filter=None, limit=None):
     for doc in _json_iterate(loc):
         if docs_filter is not None and not docs_filter(doc):
             continue
-        for json_tuple in json_to_tuple(doc):
-            yield json_tuple
+        for json_data in json_to_examples(doc):
+            yield json_data


 def _json_iterate(loc):

@@ -639,21 +644,254 @@ def _consume_ent(tags):
     return [start] + middle + [end]

+cdef class TokenAnnotation:
+    def __init__(self, ids=None, words=None, tags=None, heads=None, deps=None, entities=None, morphology=None, brackets=None):
+        self.ids = ids if ids else []
+        self.words = words if words else []
+        self.tags = tags if tags else []
+        self.heads = heads if heads else []
+        self.deps = deps if deps else []
+        self.entities = entities if entities else []
+        self.brackets = brackets if brackets else []
+        self.morphology = morphology if morphology else []
+
+    @classmethod
+    def from_dict(cls, token_dict):
+        return cls(ids=token_dict.get("ids", None),
+                   words=token_dict.get("words", None),
+                   tags=token_dict.get("tags", None),
+                   heads=token_dict.get("heads", None),
+                   deps=token_dict.get("deps", None),
+                   entities=token_dict.get("entities", None),
+                   morphology=token_dict.get("morphology", None),
+                   brackets=token_dict.get("brackets", None))
+
+    def to_dict(self):
+        return {"ids": self.ids,
+                "words": self.words,
+                "tags": self.tags,
+                "heads": self.heads,
+                "deps": self.deps,
+                "entities": self.entities,
+                "morphology": self.morphology,
+                "brackets": self.brackets}
+
+
+cdef class DocAnnotation:
+    def __init__(self, cats=None, links=None):
+        self.cats = cats if cats else {}
+        self.links = links if links else {}
+
+    @classmethod
+    def from_dict(cls, doc_dict):
+        return cls(cats=doc_dict.get("cats", None), links=doc_dict.get("links", None))
+
+    def to_dict(self):
+        return {"cats": self.cats, "links": self.links}
+
+
+cdef class Example:
+    def __init__(self, doc_annotation=None, token_annotations=None, doc=None,
+                 make_projective=False, ignore_misaligned=False, goldparse=None):
+        """ Doc can either be text, or an actual Doc """
+        self.doc = doc
+        self.doc_annotation = doc_annotation if doc_annotation else DocAnnotation()
+        self.token_annotations = token_annotations if token_annotations else []
+        self.make_projective = make_projective
+        self.ignore_misaligned = ignore_misaligned
+        self.goldparse = goldparse
+
+    @classmethod
+    def from_gold(cls, goldparse, doc=None):
+        doc_annotation = DocAnnotation(cats=goldparse.cats, links=goldparse.links)
+        token_annotation = goldparse.get_token_annotation()
+        return cls(doc_annotation, [token_annotation], doc)
+
+    @classmethod
+    def from_dict(cls, example_dict, doc=None):
+        token_dicts = example_dict["token_annotations"]
+        token_annotations = [TokenAnnotation.from_dict(t) for t in token_dicts]
+        doc_dict = example_dict["doc_annotation"]
+        doc_annotation = DocAnnotation.from_dict(doc_dict)
+        return cls(doc_annotation, token_annotations, doc)
+
+    def to_dict(self):
+        """ Note that this method does NOT export the doc, only the annotations ! """
+        token_dicts = [t.to_dict() for t in self.token_annotations]
+        doc_dict = self.doc_annotation.to_dict()
+        return {"token_annotations": token_dicts, "doc_annotation": doc_dict}
+
+    @property
+    def text(self):
+        if self.doc is None:
+            return None
+        if isinstance(self.doc, Doc):
+            return self.doc.text
+        return self.doc
+
+    @property
+    def gold(self):
+        if self.goldparse is None:
+            doc, gold = self.get_gold_parses(merge=True)[0]
+            self.goldparse = gold
+        return self.goldparse
+
+    def add_token_annotation(self, ids=None, words=None, tags=None, heads=None,
+                             deps=None, entities=None, morphology=None, brackets=None):
+        t = TokenAnnotation(ids=ids, words=words, tags=tags,
+                            heads=heads, deps=deps, entities=entities,
+                            morphology=morphology, brackets=brackets)
+        self.token_annotations.append(t)
+
+    def add_doc_annotation(self, cats=None, links=None):
+        if cats:
+            self.doc_annotation.cats.update(cats)
+        if links:
+            self.doc_annotation.links.update(links)
+
+    def merge_sents(self):
+        """ Merge the list of token annotations into one object and return this new object """
+        m_example = Example(doc=self.doc, doc_annotation=self.doc_annotation)
+        m_ids, m_words, m_tags, m_heads, m_deps, m_ents, m_morph = [], [], [], [], [], [], []
+        m_brackets = []
+        i = 0
+        for t in self.token_annotations:
+            m_ids.extend(id_ + i for id_ in t.ids)
+            m_words.extend(t.words)
+            m_tags.extend(t.tags)
+            m_heads.extend(head + i if head else None for head in t.heads)
+            m_deps.extend(t.deps)
+            m_ents.extend(t.entities)
+            m_morph.extend(t.morphology)
+            m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
+                              for b in t.brackets)
+            i += len(t.ids)
+        m_example.add_token_annotation(ids=m_ids, words=m_words, tags=m_tags,
+                                       heads=m_heads, deps=m_deps, entities=m_ents,
+                                       morphology=m_morph, brackets=m_brackets)
+        return m_example
+
+    def get_gold_parses(self, merge=False, vocab=None):
+        """Return a list of (doc, GoldParse) objects.
+        If merge is set to True, add all Token annotations to one big list."""
+        d = self.doc_annotation
+        # merging different sentences
+        if merge:
+            merged_example = self.merge_sents()
+            assert(len(merged_example.token_annotations)) == 1
+            t = merged_example.token_annotations[0]
+            m_doc = merged_example.doc
+            if not m_doc:
+                if not vocab:
+                    raise ValueError(Errors.E998)
+                m_doc = Doc(vocab, words=t.words)
+            try:
+                gp = GoldParse.from_annotation(m_doc, d, t, make_projective=self.make_projective)
+            except AlignmentError:
+                if self.ignore_misaligned:
+                    gp = None
+                else:
+                    raise
+            return [(self.doc, gp)]
+        # we only have one sentence and an appropriate doc
+        elif len(self.token_annotations) == 1 and self.doc is not None:
+            t = self.token_annotations[0]
+            try:
+                gp = GoldParse.from_annotation(self.doc, d, t, make_projective=self.make_projective)
+            except AlignmentError:
+                if self.ignore_misaligned:
+                    gp = None
+                else:
+                    raise
+            return [(self.doc, gp)]
+        # not merging: one GoldParse per 'sentence', defining docs with the words from each sentence
+        else:
+            parses = []
+            for t in self.token_annotations:
+                if not vocab:
+                    raise ValueError(Errors.E998)
+                t_doc = Doc(vocab, words=t.words)
+                try:
+                    gp = GoldParse.from_annotation(t_doc, d, t, make_projective=self.make_projective)
+                except AlignmentError:
+                    if self.ignore_misaligned:
+                        gp = None
+                    else:
+                        raise
+                if gp is not None:
+                    parses.append((t_doc, gp))
+            return parses
+
+    @classmethod
+    def to_example_objects(cls, examples, make_doc=None, keep_raw_text=False):
+        """
+        Return a list of Example objects, from a variety of input formats.
+        make_doc needs to be provided when the examples contain text strings and keep_raw_text=False
+        """
+        if isinstance(examples, Example):
+            return [examples]
+        if isinstance(examples, tuple):
+            examples = [examples]
+        converted_examples = []
+        for ex in examples:
+            # convert string to Doc to Example
+            if isinstance(ex, basestring_):
+                if keep_raw_text:
+                    converted_examples.append(Example(doc=ex))
+                else:
+                    doc = make_doc(ex)
+                    converted_examples.append(Example(doc=doc))
+            # convert Doc to Example
+            elif isinstance(ex, Doc):
+                converted_examples.append(Example(doc=ex))
+            # convert tuples to Example
+            elif isinstance(ex, tuple) and len(ex) == 2:
+                doc, gold = ex
+                gold_dict = {}
+                # convert string to Doc
+                if isinstance(doc, basestring_) and not keep_raw_text:
+                    doc = make_doc(doc)
+                # convert dict to GoldParse
+                if isinstance(gold, dict):
+                    gold_dict = gold
+                    if doc is not None or gold.get("words", None) is not None:
+                        gold = GoldParse(doc, **gold)
+                    else:
+                        gold = None
+                if gold is not None:
+                    converted_examples.append(Example.from_gold(goldparse=gold, doc=doc))
+                else:
+                    raise ValueError(Errors.E999.format(gold_dict=gold_dict))
+            else:
+                converted_examples.append(ex)
+        return converted_examples
+
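As a rough sketch of the two main entry points added above (all values invented): get_gold_parses() turns an Example back into (Doc, GoldParse) pairs, and to_example_objects() normalizes the older training inputs, such as (text, gold-dict) pairs, into Example objects.

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
example = Example(doc="I like pizza")
example.add_token_annotation(words=["I", "like", "pizza"],
                             tags=["PRP", "VBP", "NN"])
# one (Doc, GoldParse) pair per token_annotation when merge=False
parses = example.get_gold_parses(merge=False, vocab=nlp.vocab)

# legacy-style (text, dict) pairs are converted on the fly
legacy = [("I like pizza", {"tags": ["PRP", "VBP", "NN"]})]
examples = Example.to_example_objects(legacy, make_doc=nlp.make_doc)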
 cdef class GoldParse:
     """Collection for training annotations.

     DOCS: https://spacy.io/api/goldparse
     """
     @classmethod
-    def from_annot_tuples(cls, doc, annot_tuples, cats=None, make_projective=False):
-        _, words, tags, heads, deps, entities = annot_tuples
-        return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
-                   entities=entities, cats=cats,
+    def from_annotation(cls, doc, doc_annotation, token_annotation, make_projective=False):
+        return cls(doc, words=token_annotation.words, tags=token_annotation.tags,
+                   heads=token_annotation.heads, deps=token_annotation.deps, entities=token_annotation.entities,
+                   morphology=token_annotation.morphology, cats=doc_annotation.cats, links=doc_annotation.links,
                    make_projective=make_projective)

-    def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
+    def get_token_annotation(self):
+        ids = None
+        if self.words:
+            ids = list(range(len(self.words)))
+
+        return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
+                               heads=self.heads, deps=self.labels, entities=self.ner,
+                               morphology=self.morphology)
+
+    def __init__(self, doc, words=None, tags=None, morphology=None,
                  heads=None, deps=None, entities=None, make_projective=False,
-                 cats=None, links=None, **_):
+                 cats=None, links=None):
         """Create a GoldParse. The fields will not be initialized if len(doc) is zero.

         doc (Doc): The document the annotations refer to.
@@ -688,19 +926,19 @@ cdef class GoldParse:
         self.length = len(doc)

         self.cats = {} if cats is None else dict(cats)
-        self.links = links
+        self.links = {} if links is None else dict(links)

         # avoid allocating memory if the doc does not contain any tokens
         if self.length > 0:
-            if words is None:
+            if not words:
                 words = [token.text for token in doc]
-            if tags is None:
+            if not tags:
                 tags = [None for _ in words]
-            if heads is None:
+            if not heads:
                 heads = [None for _ in words]
-            if deps is None:
+            if not deps:
                 deps = [None for _ in words]
-            if morphology is None:
+            if not morphology:
                 morphology = [None for _ in words]
             if entities is None:
                 entities = ["-" for _ in words]
@@ -710,7 +948,7 @@ cdef class GoldParse:
             # Translate the None values to '-', to make processing easier.
             # See Issue #2603
             entities = [(ent if ent is not None else "-") for ent in entities]
-            if not isinstance(entities[0], basestring):
+            if not isinstance(entities[0], basestring_):
                 # Assume we have entities specified by character offset.
                 entities = biluo_tags_from_offsets(doc, entities)
@@ -745,8 +983,9 @@ cdef class GoldParse:
             self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]

-            annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
-            self.orig_annot = list(zip(*annot_tuples))
+            self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
+                                        heads=heads, deps=deps, entities=entities, morphology=morphology,
+                                        brackets=[])

             for i, gold_i in enumerate(self.cand_to_gold):
                 if doc[i].text.isspace():
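A small sketch (not from the patch) of what this last change means for downstream code: the zipped orig_annot list is replaced by a single TokenAnnotation stored on gold.orig, and a GoldParse can be wrapped back into an Example via from_gold. The sentence and tags below are invented.

import spacy
from spacy.gold import Example, GoldParse

nlp = spacy.blank("en")
doc = nlp.make_doc("I like pizza")
gold = GoldParse(doc, tags=["PRP", "VBP", "NN"])
# before this patch: gold.orig_annot was a list of zipped tuples
# after: the same data lives on a single TokenAnnotation at gold.orig
assert gold.orig.words == ["I", "like", "pizza"]
wrapped = Example.from_gold(gold, doc=doc)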
spacy/language.py
@@ -3,6 +3,8 @@ from __future__ import absolute_import, unicode_literals
 import random
 import itertools

+from spacy.gold import Example
 from spacy.util import minibatch
 import weakref
 import functools
@@ -409,7 +411,7 @@ class Language(object):
     def __call__(self, text, disable=[], component_cfg=None):
         """Apply the pipeline to some text. The text can span multiple sentences,
-        and can contain arbtrary whitespace. Alignment into the original string
+        and can contain arbitrary whitespace. Alignment into the original string
         is preserved.

         text (unicode): The text to be processed.
@@ -452,30 +454,10 @@ class Language(object):
     def make_doc(self, text):
         return self.tokenizer(text)

-    def _format_docs_and_golds(self, docs, golds):
-        """Format golds and docs before update models."""
-        expected_keys = ("words", "tags", "heads", "deps", "entities", "cats", "links")
-        gold_objs = []
-        doc_objs = []
-        for doc, gold in zip(docs, golds):
-            if isinstance(doc, basestring_):
-                doc = self.make_doc(doc)
-            if not isinstance(gold, GoldParse):
-                unexpected = [k for k in gold if k not in expected_keys]
-                if unexpected:
-                    err = Errors.E151.format(unexp=unexpected, exp=expected_keys)
-                    raise ValueError(err)
-                gold = GoldParse(doc, **gold)
-            doc_objs.append(doc)
-            gold_objs.append(gold)
-
-        return doc_objs, gold_objs
-
-    def update(self, docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None):
+    def update(self, examples, drop=0.0, sgd=None, losses=None, component_cfg=None):
         """Update the models in the pipeline.

-        docs (iterable): A batch of `Doc` objects.
-        golds (iterable): A batch of `GoldParse` objects.
+        examples (iterable): A batch of `Example` or `Doc` objects.
         drop (float): The dropout rate.
         sgd (callable): An optimizer.
         losses (dict): Dictionary to update with the loss, keyed by component.
@@ -484,18 +466,16 @@ class Language(object):
         DOCS: https://spacy.io/api/language#update
         """
-        if len(docs) != len(golds):
-            raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
-        if len(docs) == 0:
+        if len(examples) == 0:
             return
+        examples = Example.to_example_objects(examples, make_doc=self.make_doc)

         if sgd is None:
             if self._optimizer is None:
                 self._optimizer = create_default_optimizer(Model.ops)
             sgd = self._optimizer
-        # Allow dict of args to GoldParse, instead of GoldParse objects.
-        docs, golds = self._format_docs_and_golds(docs, golds)
-        grads = {}

+        grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
@@ -512,18 +492,18 @@ class Language(object):
             grads = {}
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("drop", drop)
-            proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs)
+            proc.update(examples, sgd=get_grads, losses=losses, **kwargs)
             for key, (W, dW) in grads.items():
                 sgd(W, dW, key=key)

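A hedged sketch of what the reworked update() accepts after this change: plain texts, Doc objects, Example objects, or the old (text, gold-dict) tuples, all funnelled through Example.to_example_objects(). This assumes a blank English pipeline whose tagger picks up the default tag map; the training data is invented.

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("tagger"))
optimizer = nlp.begin_training()

# old-style (text, gold-dict) pairs still work; they are converted internally
train_data = [("I like pizza", {"tags": ["PRP", "VBP", "NN"]})]
nlp.update(train_data, sgd=optimizer)

# Example objects can be passed directly as well
example = Example(doc=nlp.make_doc("I like pizza"))
example.add_token_annotation(words=["I", "like", "pizza"],
                             tags=["PRP", "VBP", "NN"])
nlp.update([example], sgd=optimizer)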
-    def rehearse(self, docs, sgd=None, losses=None, config=None):
+    def rehearse(self, examples, sgd=None, losses=None, config=None):
         """Make a "rehearsal" update to the models in the pipeline, to prevent
         forgetting. Rehearsal updates run an initial copy of the model over some
         data, and update the model so its current predictions are more like the
         initial ones. This is useful for keeping a pretrained model on-track,
         even if you're updating it with a smaller set of examples.

-        docs (iterable): A batch of `Doc` objects.
+        examples (iterable): A batch of `Doc` objects.
         drop (float): The dropout rate.
         sgd (callable): An optimizer.
         RETURNS (dict): Results from the update.
@@ -531,22 +511,18 @@ class Language(object):
         EXAMPLE:
         >>> raw_text_batches = minibatch(raw_texts)
         >>> for labelled_batch in minibatch(zip(train_docs, train_golds)):
-        >>>     docs, golds = zip(*train_docs)
-        >>>     nlp.update(docs, golds)
+        >>>     nlp.update(labelled_batch)
         >>>     raw_batch = [nlp.make_doc(text) for text in next(raw_text_batches)]
         >>>     nlp.rehearse(raw_batch)
         """
         # TODO: document
-        if len(docs) == 0:
+        if len(examples) == 0:
             return
+        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
         if sgd is None:
             if self._optimizer is None:
                 self._optimizer = create_default_optimizer(Model.ops)
             sgd = self._optimizer
-        docs = list(docs)
-        for i, doc in enumerate(docs):
-            if isinstance(doc, basestring_):
-                docs[i] = self.make_doc(doc)
         pipes = list(self.pipeline)
         random.shuffle(pipes)
         if config is None:
@@ -563,44 +539,45 @@ class Language(object):
             if not hasattr(proc, "rehearse"):
                 continue
             grads = {}
-            proc.rehearse(docs, sgd=get_grads, losses=losses, **config.get(name, {}))
+            proc.rehearse(examples, sgd=get_grads, losses=losses, **config.get(name, {}))
             for key, (W, dW) in grads.items():
                 sgd(W, dW, key=key)
         return losses

-    def preprocess_gold(self, docs_golds):
+    def preprocess_gold(self, examples):
         """Can be called before training to pre-process gold data. By default,
         it handles nonprojectivity and adds missing tags to the tag map.

-        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
-        YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
+        examples (iterable): `Example` objects.
+        YIELDS (tuple): `Example` objects.
         """
         for name, proc in self.pipeline:
             if hasattr(proc, "preprocess_gold"):
-                docs_golds = proc.preprocess_gold(docs_golds)
-        for doc, gold in docs_golds:
-            yield doc, gold
+                examples = proc.preprocess_gold(examples)
+        for ex in examples:
+            yield ex

-    def begin_training(self, get_gold_tuples=None, sgd=None, component_cfg=None, **cfg):
+    def begin_training(self, get_examples=None, sgd=None, component_cfg=None, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.

-        get_gold_tuples (function): Function returning gold data
+        get_examples (function): Function returning example training data (TODO: document format change since 3.0)
         component_cfg (dict): Config parameters for specific components.
         **cfg: Config parameters.
         RETURNS: An optimizer.

         DOCS: https://spacy.io/api/language#begin_training
         """
-        if get_gold_tuples is None:
-            get_gold_tuples = lambda: []
+        # TODO: throw warning when get_gold_tuples is provided instead of get_examples
+        if get_examples is None:
+            get_examples = lambda: []
         # Populate vocab
         else:
-            for _, annots_brackets in get_gold_tuples():
-                _ = annots_brackets.pop()
-                for annots, _ in annots_brackets:
-                    for word in annots[1]:
-                        _ = self.vocab[word]  # noqa: F841
+            for example in get_examples():
+                for token_annotation in example.token_annotations:
+                    for word in token_annotation.words:
+                        _ = self.vocab[word]  # noqa: F841

         if cfg.get("device", -1) >= 0:
             util.use_gpu(cfg["device"])
         if self.vocab.vectors.data.shape[1] >= 1:
@@ -618,7 +595,7 @@ class Language(object):
                 kwargs = component_cfg.get(name, {})
                 kwargs.update(cfg)
                 proc.begin_training(
-                    get_gold_tuples,
+                    get_examples,
                     pipeline=self.pipeline,
                     sgd=self._optimizer,
                     **kwargs
@@ -650,11 +627,11 @@ class Language(object):
         return self._optimizer

     def evaluate(
-        self, docs_golds, verbose=False, batch_size=256, scorer=None, component_cfg=None
+        self, examples, verbose=False, batch_size=256, scorer=None, component_cfg=None
     ):
         """Evaluate a model's pipeline components.

-        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
+        examples (iterable): `Example` objects.
         verbose (bool): Print debugging information.
         batch_size (int): Batch size to use.
         scorer (Scorer): Optional `Scorer` to use. If not passed in, a new one
@@ -665,30 +642,24 @@ class Language(object):
         DOCS: https://spacy.io/api/language#evaluate
         """
+        examples = Example.to_example_objects(examples, make_doc=self.make_doc)
         if scorer is None:
             scorer = Scorer(pipeline=self.pipeline)
         if component_cfg is None:
             component_cfg = {}
-        docs, golds = zip(*docs_golds)
-        docs = [
-            self.make_doc(doc) if isinstance(doc, basestring_) else doc for doc in docs
-        ]
-        golds = list(golds)
         for name, pipe in self.pipeline:
             kwargs = component_cfg.get(name, {})
             kwargs.setdefault("batch_size", batch_size)
             if not hasattr(pipe, "pipe"):
-                docs = _pipe(pipe, docs, kwargs)
+                examples = _pipe(pipe, examples, kwargs)
             else:
-                docs = pipe.pipe(docs, **kwargs)
-        for doc, gold in zip(docs, golds):
-            if not isinstance(gold, GoldParse):
-                gold = GoldParse(doc, **gold)
+                examples = pipe.pipe(examples, as_example=True, **kwargs)
+        for ex in examples:
             if verbose:
-                print(doc)
+                print(ex.doc)
             kwargs = component_cfg.get("scorer", {})
             kwargs.setdefault("verbose", verbose)
-            scorer.score(doc, gold, **kwargs)
+            scorer.score(ex, **kwargs)
         return scorer

     @contextmanager
@@ -733,6 +704,7 @@ class Language(object):
         cleanup=False,
         component_cfg=None,
         n_process=1,
+        as_example=False
     ):
         """Process texts as a stream, and yield `Doc` objects in order.

@@ -770,6 +742,7 @@ class Language(object):
             batch_size=batch_size,
             disable=disable,
             component_cfg=component_cfg,
+            as_example=False
         )
         for doc, context in izip(docs, contexts):
             yield (doc, context)
@@ -1095,15 +1068,15 @@ class DisabledPipes(list):
         self[:] = []


-def _pipe(docs, proc, kwargs):
+def _pipe(examples, proc, kwargs):
     # We added some args for pipe that __call__ doesn't expect.
     kwargs = dict(kwargs)
     for arg in ["n_threads", "batch_size"]:
         if arg in kwargs:
             kwargs.pop(arg)
-    for doc in docs:
-        doc = proc(doc, **kwargs)
-        yield doc
+    for ex in examples:
+        ex = proc(ex, **kwargs)
+        yield ex


 def _apply_pipes(make_doc, pipes, reciever, sender):
spacy/pipeline/morphologizer.pyx
@@ -97,18 +97,19 @@ class Morphologizer(Pipe):
             if doc[j].morph.pos != 0:
                 doc.c[j].pos = doc[j].morph.pos

-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, examples, drop=0., sgd=None, losses=None):
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.

+        docs = [self._get_doc(ex) for ex in examples]
         tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
-        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
+        loss, d_tag_scores = self.get_loss(examples, tag_scores)
         bp_tag_scores(d_tag_scores, sgd=sgd)

         if losses is not None:
             losses[self.name] += loss

-    def get_loss(self, docs, golds, scores):
+    def get_loss(self, examples, scores):
         guesses = []
         for doc_scores in scores:
             guesses.append(scores_to_guesses(doc_scores, self.model.softmax.out_sizes))
@@ -122,7 +123,9 @@ class Morphologizer(Pipe):
         # Do this on CPU, as we can't vectorize easily.
         target = numpy.zeros(scores.shape, dtype='f')
         field_sizes = self.model.softmax.out_sizes
-        for doc, gold in zip(docs, golds):
+        for example in examples:
+            doc = example.doc
+            gold = example.gold
             for t, features in enumerate(gold.morphology):
                 if features is None:
                     target[idx] = scores[idx]
@@ -146,6 +149,7 @@ class Morphologizer(Pipe):
         scores = self.model.ops.asarray(scores, dtype='f')
         d_scores = scores - target
         loss = (d_scores**2).sum()
+        docs = [self._get_doc(ex) for ex in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
spacy/pipeline/pipes.pyx
@@ -13,6 +13,7 @@ from thinc.misc import LayerNorm
 from thinc.neural.util import to_categorical
 from thinc.neural.util import get_array_module

+from spacy.gold import Example
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -59,11 +60,17 @@ class Pipe(object):
     def from_nlp(cls, nlp, **cfg):
         return cls(nlp.vocab, **cfg)

+    def _get_doc(self, example):
+        """ Use this method if the `example` method can be both a Doc or an Example """
+        if isinstance(example, Doc):
+            return example
+        return example.doc
+
     def __init__(self, vocab, model=True, **cfg):
         """Create a new pipe instance."""
         raise NotImplementedError

-    def __call__(self, doc):
+    def __call__(self, example):
         """Apply the pipe to one document. The document is
         modified in-place, and returned.
@@ -71,12 +78,16 @@ class Pipe(object):
         and `set_annotations()` methods.
         """
         self.require_model()
+        doc = self._get_doc(example)
         predictions = self.predict([doc])
         if isinstance(predictions, tuple) and len(predictions) == 2:
             scores, tensors = predictions
             self.set_annotations([doc], scores, tensors=tensors)
         else:
             self.set_annotations([doc], predictions)
+        if isinstance(example, Example):
+            example.doc = doc
+            return example
         return doc

     def require_model(self):
@@ -84,20 +95,29 @@ class Pipe(object):
         if getattr(self, "model", None) in (None, True, False):
             raise ValueError(Errors.E109.format(name=self.name))

-    def pipe(self, stream, batch_size=128, n_threads=-1):
+    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
         """Apply the pipe to a stream of documents.

         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
-        for docs in util.minibatch(stream, size=batch_size):
-            docs = list(docs)
+        for examples in util.minibatch(stream, size=batch_size):
+            examples = list(examples)
+            docs = [self._get_doc(ex) for ex in examples]
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
                 self.set_annotations(docs, scores, tensors=tensors)
             else:
                 self.set_annotations(docs, predictions)

-            yield from docs
+            if as_example:
+                examples = []
+                for ex, doc in zip(examples, docs):
+                    ex.doc = doc
+                    examples.append(ex)
+                yield from examples
+            else:
+                yield from docs
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -111,7 +131,7 @@ class Pipe(object):
|
||||||
"""Modify a batch of documents, using pre-computed scores."""
|
"""Modify a batch of documents, using pre-computed scores."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, drop=0.0, sgd=None, losses=None):
|
||||||
"""Learn from a batch of documents and gold-standard information,
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
updating the pipe's model.
|
updating the pipe's model.
|
||||||
|
|
||||||
|
@ -119,12 +139,12 @@ class Pipe(object):
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def rehearse(self, docs, sgd=None, losses=None, **config):
|
def rehearse(self, examples, sgd=None, losses=None, **config):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, examples, scores):
|
||||||
"""Find the loss and gradient of loss for the batch of
|
"""Find the loss and gradient of loss for the batch of
|
||||||
documents and their predicted scores."""
|
examples (with embedded docs) and their predicted scores."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
|
@ -140,7 +160,7 @@ class Pipe(object):
|
||||||
return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
|
return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
|
||||||
|
|
||||||
def begin_training(
|
def begin_training(
|
||||||
self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
|
self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs
|
||||||
):
|
):
|
||||||
"""Initialize the pipe for training, using data exampes if available.
|
"""Initialize the pipe for training, using data exampes if available.
|
||||||
If no model has been initialized yet, the model is added."""
|
If no model has been initialized yet, the model is added."""
|
||||||
|
@ -264,28 +284,40 @@ class Tensorizer(Pipe):
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault("cnn_maxout_pieces", 3)
|
self.cfg.setdefault("cnn_maxout_pieces", 3)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, example):
|
||||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||||
model. Vectors are set to the `Doc.tensor` attribute.
|
model. Vectors are set to the `Doc.tensor` attribute.
|
||||||
|
|
||||||
docs (Doc or iterable): One or more documents to add vectors to.
|
docs (Doc or iterable): One or more documents to add vectors to.
|
||||||
RETURNS (dict or None): Intermediate computations.
|
RETURNS (dict or None): Intermediate computations.
|
||||||
"""
|
"""
|
||||||
|
doc = self._get_doc(example)
|
||||||
tokvecses = self.predict([doc])
|
tokvecses = self.predict([doc])
|
||||||
self.set_annotations([doc], tokvecses)
|
self.set_annotations([doc], tokvecses)
|
||||||
|
if isinstance(example, Example):
|
||||||
|
example.doc = doc
|
||||||
|
return example
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
"""Process `Doc` objects as a stream.
|
"""Process `Doc` objects as a stream.
|
||||||
|
|
||||||
stream (iterator): A sequence of `Doc` objects to process.
|
stream (iterator): A sequence of `Doc` or `Example` objects to process.
|
||||||
batch_size (int): Number of `Doc` objects to group.
|
batch_size (int): Number of `Doc` or `Example` objects to group.
|
||||||
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
YIELDS (iterator): A sequence of `Doc` or `Example` objects, in order of input.
|
||||||
"""
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
docs = list(docs)
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
tensors = self.predict(docs)
|
tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, tensors)
|
self.set_annotations(docs, tensors)
|
||||||
|
|
||||||
|
if as_example:
|
||||||
|
examples = []
|
||||||
|
for ex, doc in zip(examples, docs):
|
||||||
|
ex.doc = doc
|
||||||
|
examples.append(ex)
|
||||||
|
yield from examples
|
||||||
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -310,7 +342,7 @@ class Tensorizer(Pipe):
|
||||||
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
|
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
|
||||||
doc.tensor = tensor
|
doc.tensor = tensor
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
|
||||||
"""Update the model.
|
"""Update the model.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -320,17 +352,16 @@ class Tensorizer(Pipe):
|
||||||
RETURNS (dict): Results from the update.
|
RETURNS (dict): Results from the update.
|
||||||
"""
|
"""
|
||||||
self.require_model()
|
self.require_model()
|
||||||
if isinstance(docs, Doc):
|
examples = Example.to_example_objects(examples)
|
||||||
docs = [docs]
|
|
||||||
inputs = []
|
inputs = []
|
||||||
bp_inputs = []
|
bp_inputs = []
|
||||||
for tok2vec in self.input_models:
|
for tok2vec in self.input_models:
|
||||||
tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
|
tensor, bp_tensor = tok2vec.begin_update([ex.doc for ex in examples], drop=drop)
|
||||||
inputs.append(tensor)
|
inputs.append(tensor)
|
||||||
bp_inputs.append(bp_tensor)
|
bp_inputs.append(bp_tensor)
|
||||||
inputs = self.model.ops.xp.hstack(inputs)
|
inputs = self.model.ops.xp.hstack(inputs)
|
||||||
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
|
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
|
||||||
loss, d_scores = self.get_loss(docs, golds, scores)
|
loss, d_scores = self.get_loss(examples, scores)
|
||||||
d_inputs = bp_scores(d_scores, sgd=sgd)
|
d_inputs = bp_scores(d_scores, sgd=sgd)
|
||||||
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
|
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
|
||||||
for d_input, bp_input in zip(d_inputs, bp_inputs):
|
for d_input, bp_input in zip(d_inputs, bp_inputs):
|
||||||
|
@ -340,18 +371,19 @@ class Tensorizer(Pipe):
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
def get_loss(self, docs, golds, prediction):
|
def get_loss(self, examples, prediction):
|
||||||
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
examples = Example.to_example_objects(examples)
|
||||||
|
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
|
||||||
target = self.vocab.vectors.data[ids]
|
target = self.vocab.vectors.data[ids]
|
||||||
d_scores = (prediction - target) / prediction.shape[0]
|
d_scores = (prediction - target) / prediction.shape[0]
|
||||||
loss = (d_scores ** 2).sum()
|
loss = (d_scores ** 2).sum()
|
||||||
return loss, d_scores
|
return loss, d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||||
"""Allocate models, pre-process training data and acquire an
|
"""Allocate models, pre-process training data and acquire an
|
||||||
optimizer.
|
optimizer.
|
||||||
|
|
||||||
gold_tuples (iterable): Gold-standard training data.
|
get_examples (iterable): Gold-standard training data.
|
||||||
pipeline (list): The pipeline the model is part of.
|
pipeline (list): The pipeline the model is part of.
|
||||||
"""
|
"""
|
||||||
if pipeline is not None:
|
if pipeline is not None:
|
||||||
|
@ -391,16 +423,29 @@ class Tagger(Pipe):
|
||||||
else:
|
else:
|
||||||
return chain(self.model.tok2vec, flatten)
|
return chain(self.model.tok2vec, flatten)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, example):
|
||||||
|
doc = self._get_doc(example)
|
||||||
tags, tokvecs = self.predict([doc])
|
tags, tokvecs = self.predict([doc])
|
||||||
self.set_annotations([doc], tags, tensors=tokvecs)
|
self.set_annotations([doc], tags, tensors=tokvecs)
|
||||||
|
if isinstance(example, Example):
|
||||||
|
example.doc = doc
|
||||||
|
return example
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
docs = list(docs)
|
examples = list(examples)
|
||||||
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
tag_ids, tokvecs = self.predict(docs)
|
tag_ids, tokvecs = self.predict(docs)
|
||||||
self.set_annotations(docs, tag_ids, tensors=tokvecs)
|
self.set_annotations(docs, tag_ids, tensors=tokvecs)
|
||||||
|
|
||||||
|
if as_example:
|
||||||
|
examples = []
|
||||||
|
for ex, doc in zip(examples, docs):
|
||||||
|
ex.doc = doc
|
||||||
|
examples.append(ex)
|
||||||
|
yield from examples
|
||||||
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -452,47 +497,51 @@ class Tagger(Pipe):
|
||||||
doc.extend_tensor(tensors[i])
|
doc.extend_tensor(tensors[i])
|
||||||
doc.is_tagged = True
|
doc.is_tagged = True
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, examples, drop=0., sgd=None, losses=None):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
|
examples = Example.to_example_objects(examples)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
|
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return
|
return
|
||||||
|
|
||||||
tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
|
tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
|
||||||
loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
|
loss, d_tag_scores = self.get_loss(examples, tag_scores)
|
||||||
bp_tag_scores(d_tag_scores, sgd=sgd)
|
bp_tag_scores(d_tag_scores, sgd=sgd)
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
|
|
||||||
def rehearse(self, docs, drop=0., sgd=None, losses=None):
|
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||||
"""Perform a 'rehearsal' update, where we try to match the output of
|
"""Perform a 'rehearsal' update, where we try to match the output of
|
||||||
an initial model.
|
an initial model.
|
||||||
"""
|
"""
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return
|
return
|
||||||
|
examples = Example.to_example_objects(examples)
|
||||||
|
docs = [ex.doc for ex in examples]
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return
|
return
|
||||||
guesses, backprop = self.model.begin_update(docs, drop=drop)
|
guesses, backprop = self.model.begin_update(docs, drop=drop)
|
||||||
target = self._rehearsal_model(docs)
|
target = self._rehearsal_model(examples)
|
||||||
gradient = guesses - target
|
gradient = guesses - target
|
||||||
backprop(gradient, sgd=sgd)
|
backprop(gradient, sgd=sgd)
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, examples, scores):
|
||||||
scores = self.model.ops.flatten(scores)
|
scores = self.model.ops.flatten(scores)
|
||||||
tag_index = {tag: i for i, tag in enumerate(self.labels)}
|
tag_index = {tag: i for i, tag in enumerate(self.labels)}
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
|
||||||
for gold in golds:
|
for ex in examples:
|
||||||
|
gold = ex.gold
|
||||||
for tag in gold.tags:
|
for tag in gold.tags:
|
||||||
if tag is None:
|
if tag is None:
|
||||||
correct[idx] = guesses[idx]
|
correct[idx] = guesses[idx]
|
||||||
|
@ -506,20 +555,20 @@ class Tagger(Pipe):
|
||||||
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
|
||||||
d_scores *= self.model.ops.asarray(known_labels)
|
d_scores *= self.model.ops.asarray(known_labels)
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores**2).sum()
|
||||||
|
docs = [ex.doc for ex in examples]
|
||||||
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
|
||||||
if not any(table in self.vocab.lookups for table in lemma_tables):
|
if not any(table in self.vocab.lookups for table in lemma_tables):
|
||||||
user_warning(Warnings.W022)
|
user_warning(Warnings.W022)
|
||||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
new_tag_map = OrderedDict()
|
new_tag_map = OrderedDict()
|
||||||
for raw_text, annots_brackets in get_gold_tuples():
|
for example in get_examples():
|
||||||
for annots, brackets in annots_brackets:
|
for token_annotation in example.token_annotations:
|
||||||
ids, words, tags, heads, deps, ents = annots
|
for tag in token_annotation.tags:
|
||||||
for tag in tags:
|
|
||||||
if tag in orig_tag_map:
|
if tag in orig_tag_map:
|
||||||
new_tag_map[tag] = orig_tag_map[tag]
|
new_tag_map[tag] = orig_tag_map[tag]
|
||||||
else:
|
else:
|
||||||
|
@ -698,14 +747,14 @@ class MultitaskObjective(Tagger):
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, tok2vec=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None, tok2vec=None,
|
||||||
sgd=None, **kwargs):
|
sgd=None, **kwargs):
|
||||||
gold_tuples = nonproj.preprocess_training_data(get_gold_tuples())
|
gold_examples = nonproj.preprocess_training_data(get_examples())
|
||||||
for raw_text, annots_brackets in gold_tuples:
|
# for raw_text, doc_annot in gold_tuples:
|
||||||
for annots, brackets in annots_brackets:
|
for example in gold_examples:
|
||||||
ids, words, tags, heads, deps, ents = annots
|
for token_annotation in example.token_annotations:
|
||||||
for i in range(len(ids)):
|
for i in range(len(token_annotation.ids)):
|
||||||
label = self.make_label(i, words, tags, heads, deps, ents)
|
label = self.make_label(i, token_annotation)
|
||||||
if label is not None and label not in self.labels:
|
if label is not None and label not in self.labels:
|
||||||
self.labels[label] = len(self.labels)
|
self.labels[label] = len(self.labels)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
@ -735,18 +784,17 @@ class MultitaskObjective(Tagger):
|
||||||
scores = self.model.softmax(tokvecs)
|
scores = self.model.softmax(tokvecs)
|
||||||
return tokvecs, scores
|
return tokvecs, scores
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, examples, scores):
|
||||||
if len(docs) != len(golds):
|
|
||||||
raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs),
|
|
||||||
n_golds=len(golds)))
|
|
||||||
cdef int idx = 0
|
cdef int idx = 0
|
||||||
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
correct = numpy.zeros((scores.shape[0],), dtype="i")
|
||||||
guesses = scores.argmax(axis=1)
|
guesses = scores.argmax(axis=1)
|
||||||
|
golds = [ex.gold for ex in examples]
|
||||||
|
docs = [ex.doc for ex in examples]
|
||||||
for i, gold in enumerate(golds):
|
for i, gold in enumerate(golds):
|
||||||
for j in range(len(docs[i])):
|
for j in range(len(docs[i])):
|
||||||
# Handes alignment for tokenization differences
|
# Handels alignment for tokenization differences
|
||||||
label = self.make_label(j, gold.words, gold.tags,
|
token_annotation = gold.get_token_annotation()
|
||||||
gold.heads, gold.labels, gold.ents)
|
label = self.make_label(j, token_annotation)
|
||||||
if label is None or label not in self.labels:
|
if label is None or label not in self.labels:
|
||||||
correct[idx] = guesses[idx]
|
correct[idx] = guesses[idx]
|
||||||
else:
|
else:
|
||||||
|
@ -758,39 +806,39 @@ class MultitaskObjective(Tagger):
|
||||||
return float(loss), d_scores
|
return float(loss), d_scores
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_dep(i, words, tags, heads, deps, ents):
|
def make_dep(i, token_annotation):
|
||||||
if deps[i] is None or heads[i] is None:
|
if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
|
||||||
return None
|
return None
|
||||||
return deps[i]
|
return token_annotation.deps[i]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_tag(i, words, tags, heads, deps, ents):
|
def make_tag(i, token_annotation):
|
||||||
return tags[i]
|
return token_annotation.tags[i]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_ent(i, words, tags, heads, deps, ents):
|
def make_ent(i, token_annotation):
|
||||||
if ents is None:
|
if token_annotation.entities is None:
|
||||||
return None
|
return None
|
||||||
return ents[i]
|
return token_annotation.entities[i]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_dep_tag_offset(i, words, tags, heads, deps, ents):
|
def make_dep_tag_offset(i, token_annotation):
|
||||||
if deps[i] is None or heads[i] is None:
|
if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
|
||||||
return None
|
return None
|
||||||
offset = heads[i] - i
|
offset = token_annotation.heads[i] - i
|
||||||
offset = min(offset, 2)
|
offset = min(offset, 2)
|
||||||
offset = max(offset, -2)
|
offset = max(offset, -2)
|
||||||
return "%s-%s:%d" % (deps[i], tags[i], offset)
|
return "%s-%s:%d" % (token_annotation.deps[i], token_annotation.tags[i], offset)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_ent_tag(i, words, tags, heads, deps, ents):
|
def make_ent_tag(i, token_annotation):
|
||||||
if ents is None or ents[i] is None:
|
if token_annotation.entities is None or token_annotation.entities[i] is None:
|
||||||
return None
|
return None
|
||||||
else:
|
else:
|
||||||
return "%s-%s" % (tags[i], ents[i])
|
return "%s-%s" % (token_annotation.tags[i], token_annotation.entities[i])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def make_sent_start(target, words, tags, heads, deps, ents, cache=True, _cache={}):
|
def make_sent_start(target, token_annotation, cache=True, _cache={}):
|
||||||
"""A multi-task objective for representing sentence boundaries,
|
"""A multi-task objective for representing sentence boundaries,
|
||||||
using BILU scheme. (O is impossible)
|
using BILU scheme. (O is impossible)
|
||||||
|
|
||||||
|
@ -799,6 +847,8 @@ class MultitaskObjective(Tagger):
|
||||||
of gold data. You can pass cache=False if you know the cache will
|
of gold data. You can pass cache=False if you know the cache will
|
||||||
do the wrong thing.
|
do the wrong thing.
|
||||||
"""
|
"""
|
||||||
|
words = token_annotation.words
|
||||||
|
heads = token_annotation.heads
|
||||||
assert len(words) == len(heads)
|
assert len(words) == len(heads)
|
||||||
assert target < len(words), (target, len(words))
|
assert target < len(words), (target, len(words))
|
||||||
if cache:
|
if cache:
|
||||||
|
@ -857,7 +907,7 @@ class ClozeMultitask(Pipe):
|
||||||
def set_annotations(self, docs, dep_ids, tensors=None):
|
def set_annotations(self, docs, dep_ids, tensors=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None,
|
def begin_training(self, get_examples=lambda: [], pipeline=None,
|
||||||
tok2vec=None, sgd=None, **kwargs):
|
tok2vec=None, sgd=None, **kwargs):
|
||||||
link_vectors_to_models(self.vocab)
|
link_vectors_to_models(self.vocab)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
@ -874,25 +924,26 @@ class ClozeMultitask(Pipe):
|
||||||
vectors = self.model.output_layer(tokvecs)
|
vectors = self.model.output_layer(tokvecs)
|
||||||
return tokvecs, vectors
|
return tokvecs, vectors
|
||||||
|
|
||||||
def get_loss(self, docs, vectors, prediction):
|
def get_loss(self, examples, vectors, prediction):
|
||||||
# The simplest way to implement this would be to vstack the
|
# The simplest way to implement this would be to vstack the
|
||||||
# token.vector values, but that's a bit inefficient, especially on GPU.
|
# token.vector values, but that's a bit inefficient, especially on GPU.
|
||||||
# Instead we fetch the index into the vectors table for each of our tokens,
|
# Instead we fetch the index into the vectors table for each of our tokens,
|
||||||
# and look them up all at once. This prevents data copying.
|
# and look them up all at once. This prevents data copying.
|
||||||
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
|
||||||
target = vectors[ids]
|
target = vectors[ids]
|
||||||
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
|
loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
|
||||||
return float(loss), gradient
|
return float(loss), gradient
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, examples, drop=0., sgd=None, losses=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def rehearse(self, docs, drop=0., sgd=None, losses=None):
|
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
|
examples = Example.to_example_objects(examples)
|
||||||
if losses is not None and self.name not in losses:
|
if losses is not None and self.name not in losses:
|
||||||
losses[self.name] = 0.
|
losses[self.name] = 0.
|
||||||
predictions, bp_predictions = self.model.begin_update(docs, drop=drop)
|
predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples], drop=drop)
|
||||||
loss, d_predictions = self.get_loss(docs, self.vocab.vectors.data, predictions)
|
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
|
||||||
bp_predictions(d_predictions, sgd=sgd)
|
bp_predictions(d_predictions, sgd=sgd)
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
|
@ -947,11 +998,20 @@ class TextCategorizer(Pipe):
|
||||||
def labels(self, value):
|
def labels(self, value):
|
||||||
self.cfg["labels"] = tuple(value)
|
self.cfg["labels"] = tuple(value)
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
docs = list(docs)
|
examples = list(examples)
|
||||||
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
scores, tensors = self.predict(docs)
|
scores, tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, scores, tensors=tensors)
|
self.set_annotations(docs, scores, tensors=tensors)
|
||||||
|
|
||||||
|
if as_example:
|
||||||
|
examples = []
|
||||||
|
for ex, doc in zip(examples, docs):
|
||||||
|
ex.doc = doc
|
||||||
|
examples.append(ex)
|
||||||
|
yield from examples
|
||||||
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -973,33 +1033,37 @@ class TextCategorizer(Pipe):
|
||||||
for j, label in enumerate(self.labels):
|
for j, label in enumerate(self.labels):
|
||||||
doc.cats[label] = float(scores[i, j])
|
doc.cats[label] = float(scores[i, j])
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
def update(self, examples, state=None, drop=0., sgd=None, losses=None):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
if not any(len(doc) for doc in docs):
|
examples = Example.to_example_objects(examples)
|
||||||
|
if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return
|
return
|
||||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
scores, bp_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
|
||||||
loss, d_scores = self.get_loss(docs, golds, scores)
|
loss, d_scores = self.get_loss(examples, scores)
|
||||||
bp_scores(d_scores, sgd=sgd)
|
bp_scores(d_scores, sgd=sgd)
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
|
|
||||||
def rehearse(self, docs, drop=0., sgd=None, losses=None):
|
def rehearse(self, examples, drop=0., sgd=None, losses=None):
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return
|
return
|
||||||
|
examples = Example.to_example_objects(examples)
|
||||||
|
docs=[ex.doc for ex in examples]
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
return
|
return
|
||||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
||||||
target = self._rehearsal_model(docs)
|
target = self._rehearsal_model(examples)
|
||||||
gradient = scores - target
|
gradient = scores - target
|
||||||
bp_scores(gradient, sgd=sgd)
|
bp_scores(gradient, sgd=sgd)
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += (gradient**2).sum()
|
losses[self.name] += (gradient**2).sum()
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, examples, scores):
|
||||||
|
golds = [ex.gold for ex in examples]
|
||||||
truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
|
truths = numpy.zeros((len(golds), len(self.labels)), dtype="f")
|
||||||
not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
|
not_missing = numpy.ones((len(golds), len(self.labels)), dtype="f")
|
||||||
for i, gold in enumerate(golds):
|
for i, gold in enumerate(golds):
|
||||||
|
@ -1032,10 +1096,9 @@ class TextCategorizer(Pipe):
|
||||||
self.labels = tuple(list(self.labels) + [label])
|
self.labels = tuple(list(self.labels) + [label])
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||||
for raw_text, annot_brackets in get_gold_tuples():
|
for example in get_examples():
|
||||||
for _, (cats, _2) in annot_brackets:
|
for cat in example.doc_annotation.cats:
|
||||||
for cat in cats:
|
|
||||||
self.add_label(cat)
|
self.add_label(cat)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
|
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
|
||||||
|
@ -1074,10 +1137,10 @@ cdef class DependencyParser(Parser):
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
labeller = MultitaskObjective(self.vocab, target=target)
|
||||||
self._multitasks.append(labeller)
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
tok2vec = self.model.tok2vec
|
tok2vec = self.model.tok2vec
|
||||||
labeller.begin_training(get_gold_tuples, pipeline=pipeline,
|
labeller.begin_training(get_examples, pipeline=pipeline,
|
||||||
tok2vec=tok2vec, sgd=sgd)
|
tok2vec=tok2vec, sgd=sgd)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
@ -1116,10 +1179,10 @@ cdef class EntityRecognizer(Parser):
|
||||||
labeller = MultitaskObjective(self.vocab, target=target)
|
labeller = MultitaskObjective(self.vocab, target=target)
|
||||||
self._multitasks.append(labeller)
|
self._multitasks.append(labeller)
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_gold_tuples, pipeline, sgd=None, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, sgd=None, **cfg):
|
||||||
for labeller in self._multitasks:
|
for labeller in self._multitasks:
|
||||||
tok2vec = self.model.tok2vec
|
tok2vec = self.model.tok2vec
|
||||||
labeller.begin_training(get_gold_tuples, pipeline=pipeline,
|
labeller.begin_training(get_examples, pipeline=pipeline,
|
||||||
tok2vec=tok2vec)
|
tok2vec=tok2vec)
|
||||||
|
|
||||||
def __reduce__(self):
|
def __reduce__(self):
|
||||||
|
@ -1175,7 +1238,7 @@ class EntityLinker(Pipe):
|
||||||
if getattr(self, "kb", None) in (None, True, False):
|
if getattr(self, "kb", None) in (None, True, False):
|
||||||
raise ValueError(Errors.E139.format(name=self.name))
|
raise ValueError(Errors.E139.format(name=self.name))
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
self.cfg["entity_width"] = self.kb.entity_vector_length
|
self.cfg["entity_width"] = self.kb.entity_vector_length
|
||||||
|
|
||||||
|
@ -1187,25 +1250,18 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
return sgd
|
return sgd
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, state=None, drop=0.0, sgd=None, losses=None):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.0)
|
losses.setdefault(self.name, 0.0)
|
||||||
|
if not examples:
|
||||||
if not docs or not golds:
|
|
||||||
return 0
|
return 0
|
||||||
|
examples = Example.to_example_objects(examples)
|
||||||
if len(docs) != len(golds):
|
|
||||||
raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs),
|
|
||||||
n_golds=len(golds)))
|
|
||||||
|
|
||||||
if isinstance(docs, Doc):
|
|
||||||
docs = [docs]
|
|
||||||
golds = [golds]
|
|
||||||
|
|
||||||
sentence_docs = []
|
sentence_docs = []
|
||||||
|
docs = [ex.doc for ex in examples]
|
||||||
|
golds = [ex.gold for ex in examples]
|
||||||
|
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
ents_by_offset = dict()
|
ents_by_offset = dict()
|
||||||
|
@ -1219,19 +1275,19 @@ class EntityLinker(Pipe):
|
||||||
ent = ents_by_offset[(start, end)]
|
ent = ents_by_offset[(start, end)]
|
||||||
|
|
||||||
for kb_id, value in kb_dict.items():
|
for kb_id, value in kb_dict.items():
|
||||||
# Currently only training on the positive instances
|
# Currently only training on the positive instances - we assume there is at least 1 per doc/gold
|
||||||
if value:
|
if value:
|
||||||
sentence_docs.append(ent.sent.as_doc())
|
sentence_docs.append(ent.sent.as_doc())
|
||||||
|
|
||||||
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
|
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
|
||||||
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
|
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds)
|
||||||
bp_context(d_scores, sgd=sgd)
|
bp_context(d_scores, sgd=sgd)
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
def get_similarity_loss(self, docs, golds, scores):
|
def get_similarity_loss(self, golds, scores):
|
||||||
entity_encodings = []
|
entity_encodings = []
|
||||||
for gold in golds:
|
for gold in golds:
|
||||||
for entity, kb_dict in gold.links.items():
|
for entity, kb_dict in gold.links.items():
|
||||||
|
@ -1244,16 +1300,16 @@ class EntityLinker(Pipe):
|
||||||
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
||||||
|
|
||||||
if scores.shape != entity_encodings.shape:
|
if scores.shape != entity_encodings.shape:
|
||||||
raise RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up"))
|
raise RuntimeError(Errors.E147.format(method="get_similarity_loss", msg="gold entities do not match up"))
|
||||||
|
|
||||||
loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
|
loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings)
|
||||||
loss = loss / len(entity_encodings)
|
loss = loss / len(entity_encodings)
|
||||||
return loss, gradients
|
return loss, gradients
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, examples, scores):
|
||||||
cats = []
|
cats = []
|
||||||
for gold in golds:
|
for ex in examples:
|
||||||
for entity, kb_dict in gold.links.items():
|
for entity, kb_dict in ex.gold.links.items():
|
||||||
for kb_id, value in kb_dict.items():
|
for kb_id, value in kb_dict.items():
|
||||||
cats.append([value])
|
cats.append([value])
|
||||||
|
|
||||||
|
@ -1266,16 +1322,29 @@ class EntityLinker(Pipe):
|
||||||
loss = loss / len(cats)
|
loss = loss / len(cats)
|
||||||
return loss, d_scores
|
return loss, d_scores
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, example):
|
||||||
|
doc = self._get_doc(example)
|
||||||
kb_ids, tensors = self.predict([doc])
|
kb_ids, tensors = self.predict([doc])
|
||||||
self.set_annotations([doc], kb_ids, tensors=tensors)
|
self.set_annotations([doc], kb_ids, tensors=tensors)
|
||||||
|
if isinstance(example, Example):
|
||||||
|
example.doc = doc
|
||||||
|
return example
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for examples in util.minibatch(stream, size=batch_size):
|
||||||
docs = list(docs)
|
examples = list(examples)
|
||||||
|
docs = [self._get_doc(ex) for ex in examples]
|
||||||
kb_ids, tensors = self.predict(docs)
|
kb_ids, tensors = self.predict(docs)
|
||||||
self.set_annotations(docs, kb_ids, tensors=tensors)
|
self.set_annotations(docs, kb_ids, tensors=tensors)
|
||||||
|
|
||||||
|
if as_example:
|
||||||
|
examples = []
|
||||||
|
for ex, doc in zip(examples, docs):
|
||||||
|
ex.doc = doc
|
||||||
|
examples.append(ex)
|
||||||
|
yield from examples
|
||||||
|
else:
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
|
@ -1408,7 +1477,7 @@ class EntityLinker(Pipe):
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def rehearse(self, docs, sgd=None, losses=None, **config):
|
def rehearse(self, examples, sgd=None, losses=None, **config):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
|
@ -1416,7 +1485,7 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
|
|
||||||
@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
|
@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
|
||||||
class Sentencizer(object):
|
class Sentencizer(Pipe):
|
||||||
"""Segment the Doc into sentences using a rule-based strategy.
|
"""Segment the Doc into sentences using a rule-based strategy.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer
|
DOCS: https://spacy.io/api/sentencizer
|
||||||
|
@ -1451,14 +1520,15 @@ class Sentencizer(object):
|
||||||
def from_nlp(cls, nlp, **cfg):
|
def from_nlp(cls, nlp, **cfg):
|
||||||
return cls(**cfg)
|
return cls(**cfg)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, example):
|
||||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||||
|
|
||||||
doc (Doc): The document to process.
|
example (Doc or Example): The document to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc or Example): The processed Doc or Example.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/sentencizer#call
|
DOCS: https://spacy.io/api/sentencizer#call
|
||||||
"""
|
"""
|
||||||
|
doc = self._get_doc(example)
|
||||||
start = 0
|
start = 0
|
||||||
seen_period = False
|
seen_period = False
|
||||||
for i, token in enumerate(doc):
|
for i, token in enumerate(doc):
|
||||||
|
@ -1472,6 +1542,9 @@ class Sentencizer(object):
|
||||||
seen_period = True
|
seen_period = True
|
||||||
if start < len(doc):
|
if start < len(doc):
|
||||||
doc[start].is_sent_start = True
|
doc[start].is_sent_start = True
|
||||||
|
if isinstance(example, Example):
|
||||||
|
example.doc = doc
|
||||||
|
return example
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def to_bytes(self, **kwargs):
|
def to_bytes(self, **kwargs):
|
||||||
|
|
|
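Note on the pattern above: every pipeline component's __call__ and pipe now unwraps its input through _get_doc and, when the caller passed an Example, writes the processed Doc back onto it before returning. The sketch below is a minimal, self-contained illustration of that dispatch; the Example class here is a stand-in defined only for the sketch, not the spacy.gold.Example introduced by this PR.

# Illustrative stand-ins only -- not the spacy.gold API.
class Example(object):
    def __init__(self, doc=None, gold=None):
        self.doc = doc
        self.gold = gold


class DummyPipe(object):
    """Mimics the Doc-or-Example handling added to Pipe.__call__."""

    def _get_doc(self, example):
        # Accept either a bare Doc-like object or an Example wrapping one.
        return example.doc if isinstance(example, Example) else example

    def __call__(self, example):
        doc = self._get_doc(example)
        # ... predict() and set_annotations() would run on `doc` here ...
        if isinstance(example, Example):
            example.doc = doc   # write the processed Doc back onto the Example
            return example      # callers that passed an Example get one back
        return doc


pipe = DummyPipe()
assert pipe("a plain doc") == "a plain doc"
assert isinstance(pipe(Example(doc="a wrapped doc")), Example)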
@@ -3,7 +3,7 @@ from __future__ import division, print_function, unicode_literals

import numpy as np

-from .gold import tags_to_entities, GoldParse
+from .gold import tags_to_entities, GoldParse, DocAnnotation
from .errors import Errors


@@ -217,11 +217,10 @@ class Scorer(object):
            "textcats_per_cat": self.textcats_per_cat,
        }

-    def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")):
+    def score(self, example, verbose=False, punct_labels=("p", "punct")):
        """Update the evaluation scores from a single Doc / GoldParse pair.

-        doc (Doc): The predicted annotations.
-        gold (GoldParse): The correct annotations.
+        example (Example): The predicted annotations + correct annotations.
        verbose (bool): Print debugging information.
        punct_labels (tuple): Dependency labels for punctuation. Used to
            evaluate dependency attachments to punctuation if `eval_punct` is

@@ -229,15 +228,22 @@ class Scorer(object):

        DOCS: https://spacy.io/api/scorer#score
        """
+        if isinstance(example, tuple) and len(example) == 2:
+            doc, gold = example
+        else:
+            gold = example.gold
+            doc = example.doc

        if len(doc) != len(gold):
-            gold = GoldParse.from_annot_tuples(
-                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
-            )
+            doc_annotation = DocAnnotation(cats=gold.cats)
+            token_annotation = gold.orig
+            gold = GoldParse.from_annotation(doc, doc_annotation, [token_annotation])
+        orig = gold.orig
        gold_deps = set()
        gold_deps_per_dep = {}
        gold_tags = set()
-        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
-        for id_, word, tag, head, dep, ner in gold.orig_annot:
+        gold_ents = set(tags_to_entities(orig.entities))
+        for id_, tag, head, dep in zip(orig.ids, orig.tags, orig.heads, orig.deps):
            gold_tags.add((id_, tag))
            if dep not in (None, "") and dep.lower() not in punct_labels:
                gold_deps.add((id_, head, dep.lower()))

@@ -272,7 +278,7 @@ class Scorer(object):
                if token.dep_.lower() not in cand_deps_per_dep:
                    cand_deps_per_dep[token.dep_.lower()] = set()
                cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
-        if "-" not in [token[-1] for token in gold.orig_annot]:
+        if "-" not in orig.entities:
            # Find all NER labels in gold and doc
            ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
            # Set up all labels for per type scoring and prepare gold per type

@@ -336,7 +342,7 @@ class Scorer(object):
                Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
            )
        if verbose:
-            gold_words = [item[1] for item in gold.orig_annot]
+            gold_words = orig.words
            for w_id, h_id, dep in cand_deps - gold_deps:
                print("F", gold_words[w_id], dep, gold_words[h_id])
            for w_id, h_id, dep in gold_deps - cand_deps:
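For orientation: Scorer.score now takes a single argument that may be either an Example or, for backwards compatibility, a (doc, gold) tuple. A standalone sketch of just that unpacking step (the Example class is again a stand-in written for this illustration, not the spacy.gold class):

# Stand-in Example for illustration; the real one lives in spacy.gold.
class Example(object):
    def __init__(self, doc, gold):
        self.doc = doc
        self.gold = gold


def unpack_for_scoring(example):
    # Mirrors the branch at the top of the new Scorer.score.
    if isinstance(example, tuple) and len(example) == 2:
        doc, gold = example                    # legacy (doc, gold) pair
    else:
        doc, gold = example.doc, example.gold  # new-style Example
    return doc, gold


assert unpack_for_scoring(("doc", "gold")) == ("doc", "gold")
assert unpack_for_scoring(Example("doc", "gold")) == ("doc", "gold")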
@@ -341,10 +341,10 @@ cdef class ArcEager(TransitionSystem):
        for label in kwargs.get('right_labels', []):
            actions[RIGHT][label] = 1
            actions[REDUCE][label] = 1
-        for raw_text, sents in kwargs.get('gold_parses', []):
-            for (ids, words, tags, heads, labels, iob), ctnts in sents:
-                heads, labels = nonproj.projectivize(heads, labels)
-                for child, head, label in zip(ids, heads, labels):
+        for example in kwargs.get('gold_parses', []):
+            for token_annotation in example.token_annotations:
+                heads, labels = nonproj.projectivize(token_annotation.heads, token_annotation.deps)
+                for child, head, label in zip(token_annotation.ids, heads, labels):
                    if label.upper() == 'ROOT' :
                        label = 'ROOT'
                    if head == child:

@@ -397,7 +397,9 @@ cdef class ArcEager(TransitionSystem):
                                     self.strings[state.safe_get(i).dep]))
            else:
                predicted.add((i, state.H(i), 'ROOT'))
-            id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
+            id_ = gold.orig.ids[gold.cand_to_gold[i]]
+            head = gold.orig.heads[gold.cand_to_gold[i]]
+            dep = gold.orig.deps[gold.cand_to_gold[i]]
            truth.add((id_, head, dep))
        return truth == predicted
@@ -72,9 +72,9 @@ cdef class BiluoPushDown(TransitionSystem):
            for action in (BEGIN, IN, LAST, UNIT):
                actions[action][entity_type] = 1
        moves = ('M', 'B', 'I', 'L', 'U')
-        for raw_text, sents in kwargs.get('gold_parses', []):
-            for (ids, words, tags, heads, labels, biluo), _ in sents:
-                for i, ner_tag in enumerate(biluo):
+        for example in kwargs.get('gold_parses', []):
+            for token_annotation in example.token_annotations:
+                for i, ner_tag in enumerate(token_annotation.entities):
                    if ner_tag != 'O' and ner_tag != '-':
                        _, label = ner_tag.split('-', 1)
                        for action in (BEGIN, IN, LAST, UNIT):
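Both transition systems (ArcEager and BiluoPushDown) now read per-token fields from TokenAnnotation objects instead of unpacking (ids, words, tags, heads, labels, iob) tuples. A hedged sketch of that loop shape, using stand-in classes rather than the real spacy.gold types:

# Stand-ins for illustration; the real Example/TokenAnnotation are in spacy.gold.
class TokenAnnotation(object):
    def __init__(self, ids, heads, deps, entities):
        self.ids = ids
        self.heads = heads
        self.deps = deps
        self.entities = entities


class Example(object):
    def __init__(self, token_annotations):
        self.token_annotations = token_annotations


def collect_dep_labels(gold_parses):
    # Same loop shape as ArcEager.get_actions after this change.
    labels = set()
    for example in gold_parses:
        for token_annotation in example.token_annotations:
            for child, head, label in zip(token_annotation.ids,
                                          token_annotation.heads,
                                          token_annotation.deps):
                labels.add(label)
    return labels


ex = Example([TokenAnnotation(ids=[0, 1], heads=[1, 1],
                              deps=["nsubj", "ROOT"], entities=["O", "O"])])
assert collect_dep_labels([ex]) == {"ROOT", "nsubj"}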
@ -27,6 +27,7 @@ from thinc.neural.util import get_array_module
|
||||||
from thinc.linalg cimport Vec, VecVec
|
from thinc.linalg cimport Vec, VecVec
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
|
from spacy.gold import Example
|
||||||
from ._parser_model cimport alloc_activations, free_activations
|
from ._parser_model cimport alloc_activations, free_activations
|
||||||
from ._parser_model cimport predict_states, arg_max_if_valid
|
from ._parser_model cimport predict_states, arg_max_if_valid
|
||||||
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||||
|
@ -193,7 +194,7 @@ cdef class Parser:
|
||||||
# Defined in subclasses, to avoid circular import
|
# Defined in subclasses, to avoid circular import
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def init_multitask_objectives(self, get_gold_tuples, pipeline, **cfg):
|
def init_multitask_objectives(self, get_examples, pipeline, **cfg):
|
||||||
'''Setup models for secondary objectives, to benefit from multi-task
|
'''Setup models for secondary objectives, to benefit from multi-task
|
||||||
learning. This method is intended to be overridden by subclasses.
|
learning. This method is intended to be overridden by subclasses.
|
||||||
|
|
||||||
|
@ -203,9 +204,9 @@ cdef class Parser:
|
||||||
'''
|
'''
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def preprocess_gold(self, docs_golds):
|
def preprocess_gold(self, examples):
|
||||||
for doc, gold in docs_golds:
|
for ex in examples:
|
||||||
yield doc, gold
|
yield ex
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
# Can't decorate cdef class :(. Workaround.
|
# Can't decorate cdef class :(. Workaround.
|
||||||
|
@ -411,35 +412,31 @@ cdef class Parser:
|
||||||
beam.check_done(_beam_utils.check_final_state, NULL)
|
beam.check_done(_beam_utils.check_final_state, NULL)
|
||||||
return [b for b in beams if not b.is_done]
|
return [b for b in beams if not b.is_done]
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, examples, drop=0., sgd=None, losses=None):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
|
examples = Example.to_example_objects(examples)
|
||||||
docs = [docs]
|
|
||||||
golds = [golds]
|
|
||||||
if len(docs) != len(golds):
|
|
||||||
raise ValueError(Errors.E077.format(value='update', n_docs=len(docs),
|
|
||||||
n_golds=len(golds)))
|
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
for multitask in self._multitasks:
|
for multitask in self._multitasks:
|
||||||
multitask.update(docs, golds, drop=drop, sgd=sgd)
|
multitask.update(examples, drop=drop, sgd=sgd)
|
||||||
# The probability we use beam update, instead of falling back to
|
# The probability we use beam update, instead of falling back to
|
||||||
# a greedy update
|
# a greedy update
|
||||||
beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
|
beam_update_prob = self.cfg.get('beam_update_prob', 0.5)
|
||||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
|
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() < beam_update_prob:
|
||||||
return self.update_beam(docs, golds, self.cfg.get('beam_width', 1),
|
return self.update_beam(examples, self.cfg.get('beam_width', 1),
|
||||||
drop=drop, sgd=sgd, losses=losses,
|
drop=drop, sgd=sgd, losses=losses,
|
||||||
beam_density=self.cfg.get('beam_density', 0.001))
|
beam_density=self.cfg.get('beam_density', 0.001))
|
||||||
# Chop sequences into lengths of this many transitions, to make the
|
# Chop sequences into lengths of this many transitions, to make the
|
||||||
# batch uniform length.
|
# batch uniform length.
|
||||||
cut_gold = numpy.random.choice(range(20, 100))
|
cut_gold = numpy.random.choice(range(20, 100))
|
||||||
states, golds, max_steps = self._init_gold_batch(docs, golds, max_length=cut_gold)
|
states, golds, max_steps = self._init_gold_batch(examples, max_length=cut_gold)
|
||||||
states_golds = [(s, g) for (s, g) in zip(states, golds)
|
states_golds = [(s, g) for (s, g) in zip(states, golds)
|
||||||
if not s.is_final() and g is not None]
|
if not s.is_final() and g is not None]
|
||||||
|
|
||||||
# Prepare the stepwise model, and get the callback for finishing the batch
|
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||||
model, finish_update = self.model.begin_update(docs, drop=drop)
|
model, finish_update = self.model.begin_update([ex.doc for ex in examples], drop=drop)
|
||||||
for _ in range(max_steps):
|
for _ in range(max_steps):
|
||||||
if not states_golds:
|
if not states_golds:
|
||||||
break
|
break
|
||||||
|
@ -454,19 +451,19 @@ cdef class Parser:
|
||||||
finish_update(golds, sgd=sgd)
|
finish_update(golds, sgd=sgd)
|
||||||
return losses
|
return losses
|
||||||
|
|
||||||
def rehearse(self, docs, sgd=None, losses=None, **cfg):
|
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||||
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
|
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
|
||||||
if isinstance(docs, Doc):
|
examples = Example.to_example_objects(examples)
|
||||||
docs = [docs]
|
|
||||||
if losses is None:
|
if losses is None:
|
||||||
losses = {}
|
losses = {}
|
||||||
for multitask in self._multitasks:
|
for multitask in self._multitasks:
|
||||||
if hasattr(multitask, 'rehearse'):
|
if hasattr(multitask, 'rehearse'):
|
||||||
multitask.rehearse(docs, losses=losses, sgd=sgd)
|
multitask.rehearse(examples, losses=losses, sgd=sgd)
|
||||||
if self._rehearsal_model is None:
|
if self._rehearsal_model is None:
|
||||||
return None
|
return None
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.)
|
||||||
|
|
||||||
|
docs = [ex.doc for ex in examples]
|
||||||
states = self.moves.init_batch(docs)
|
states = self.moves.init_batch(docs)
|
||||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||||
# if labels are missing. We therefore have to check whether we need to
|
# if labels are missing. We therefore have to check whether we need to
|
||||||
|
@@ -494,15 +491,20 @@ cdef class Parser:
             losses[self.name] += loss / n_scores
         return losses

-    def update_beam(self, docs, golds, width, drop=0., sgd=None, losses=None,
+    def update_beam(self, examples, width, drop=0., sgd=None, losses=None,
                     beam_density=0.0):
+        examples = Example.to_example_objects(examples)
+        docs = [ex.doc for ex in examples]
+        golds = [ex.gold for ex in examples]
+        new_golds = []
         lengths = [len(d) for d in docs]
         states = self.moves.init_batch(docs)
         for gold in golds:
             self.moves.preprocess_gold(gold)
+            new_golds.append(gold)
         model, finish_update = self.model.begin_update(docs, drop=drop)
         states_d_scores, backprops, beams = _beam_utils.update_beam(
-            self.moves, self.nr_feature, 10000, states, golds, model.state2vec,
+            self.moves, self.nr_feature, 10000, states, new_golds, model.state2vec,
             model.vec2scores, width, drop=drop, losses=losses,
             beam_density=beam_density)
         for i, d_scores in enumerate(states_d_scores):
@@ -522,7 +524,7 @@ cdef class Parser:
         for beam in beams:
             _beam_utils.cleanup_beam(beam)

-    def _init_gold_batch(self, whole_docs, whole_golds, min_length=5, max_length=500):
+    def _init_gold_batch(self, whole_examples, min_length=5, max_length=500):
         """Make a square batch, of length equal to the shortest doc. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
         where N is the shortest doc. We'll make two states, one representing
@@ -530,6 +532,8 @@ cdef class Parser:
         cdef:
             StateClass state
             Transition action
+        whole_docs = [ex.doc for ex in whole_examples]
+        whole_golds = [ex.gold for ex in whole_examples]
         whole_states = self.moves.init_batch(whole_docs)
         max_length = max(min_length, min(max_length, min([len(doc) for doc in whole_docs])))
         max_moves = 0
@@ -592,14 +596,14 @@ cdef class Parser:
         return create_default_optimizer(self.model.ops,
                                         **self.cfg.get('optimizer', {}))

-    def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
+    def begin_training(self, get_examples, pipeline=None, sgd=None, **cfg):
         if 'model' in cfg:
             self.model = cfg['model']
-        if not hasattr(get_gold_tuples, '__call__'):
-            gold_tuples = get_gold_tuples
-            get_gold_tuples = lambda: gold_tuples
+        if not hasattr(get_examples, '__call__'):
+            gold_tuples = get_examples
+            get_examples = lambda: gold_tuples
         cfg.setdefault('min_action_freq', 30)
-        actions = self.moves.get_actions(gold_parses=get_gold_tuples(),
+        actions = self.moves.get_actions(gold_parses=get_examples(),
                                          min_freq=cfg.get('min_action_freq', 30),
                                          learn_tokens=self.cfg.get("learn_tokens", False))
         for action, labels in self.moves.labels.items():
@@ -615,15 +619,14 @@ cdef class Parser:
                 sgd = self.create_optimizer()
             doc_sample = []
             gold_sample = []
-            for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
-                for annots, brackets in annots_brackets:
-                    ids, words, tags, heads, deps, ents = annots
-                    doc_sample.append(Doc(self.vocab, words=words))
-                    gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
-                                                 heads=heads, deps=deps, entities=ents))
+            for example in islice(get_examples(), 1000):
+                parses = example.get_gold_parses(merge=False, vocab=self.vocab)
+                for doc, gold in parses:
+                    doc_sample.append(doc)
+                    gold_sample.append(gold)
             self.model.begin_training(doc_sample, gold_sample)
             if pipeline is not None:
-                self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
+                self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)
         else:
             if sgd is None:
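The parser hunks above replace the old (docs, golds) plumbing with Example objects: begin_training() now receives a get_examples callable and pulls (Doc, GoldParse) pairs out of each Example via get_gold_parses(). A minimal sketch of that flow, using only the calls visible in this diff; the sentence and labels are made up for illustration:

from spacy.gold import Example
from spacy.lang.en import English

nlp = English()
doc = nlp.tokenizer("She ate pizza")

example = Example(doc=doc)
example.add_token_annotation(
    ids=[0, 1, 2],
    words=["She", "ate", "pizza"],
    heads=[1, 1, 1],
    deps=["nsubj", "ROOT", "dobj"],
)

# begin_training() above iterates get_examples() and unpacks each Example
# into (Doc, GoldParse) pairs in exactly this way:
for doc, gold in example.get_gold_parses(merge=False, vocab=nlp.vocab):
    print(doc.text, gold.heads)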
@@ -9,6 +9,7 @@ from __future__ import unicode_literals

 from copy import copy

+from spacy.gold import Example
 from ..tokens.doc cimport Doc, set_children_from_heads
 from ..errors import Errors

@@ -77,39 +78,42 @@ def decompose(label):
 def is_decorated(label):
     return DELIMITER in label

-def count_decorated_labels(gold_tuples):
+def count_decorated_labels(gold_data):
     freqs = {}
-    for raw_text, sents in gold_tuples:
-        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            proj_heads, deco_labels = projectivize(heads, labels)
+    for example in gold_data:
+        for token_annotation in example.token_annotations:
+            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
             # set the label to ROOT for each root dependent
-            deco_labels = ['ROOT' if head == i else deco_labels[i]
+            deco_deps = ['ROOT' if head == i else deco_deps[i]
                            for i, head in enumerate(proj_heads)]
             # count label frequencies
-            for label in deco_labels:
+            for label in deco_deps:
                 if is_decorated(label):
                     freqs[label] = freqs.get(label, 0) + 1
     return freqs


-def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
+def preprocess_training_data(gold_data, label_freq_cutoff=30):
     preprocessed = []
     freqs = {}
-    for raw_text, sents in gold_tuples:
-        prepro_sents = []
-        for (ids, words, tags, heads, labels, iob), ctnts in sents:
-            proj_heads, deco_labels = projectivize(heads, labels)
+    for example in gold_data:
+        new_example = Example(doc=example.doc)
+        for token_annotation in example.token_annotations:
+            proj_heads, deco_deps = projectivize(token_annotation.heads, token_annotation.deps)
             # set the label to ROOT for each root dependent
-            deco_labels = ['ROOT' if head == i else deco_labels[i]
+            deco_deps = ['ROOT' if head == i else deco_deps[i]
                            for i, head in enumerate(proj_heads)]
             # count label frequencies
             if label_freq_cutoff > 0:
-                for label in deco_labels:
+                for label in deco_deps:
                     if is_decorated(label):
                         freqs[label] = freqs.get(label, 0) + 1
-            prepro_sents.append(
-                ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
-        preprocessed.append((raw_text, prepro_sents))
+            # TODO: the code would be less ugly when changing heads and deps in-place, but is this OK upstream ?
+            proj_token_dict = token_annotation.to_dict()
+            proj_token_dict["heads"] = proj_heads
+            proj_token_dict["deps"] = deco_deps
+            new_example.add_token_annotation(**proj_token_dict)
+        preprocessed.append(new_example)
     if label_freq_cutoff > 0:
         return _filter_labels(preprocessed, label_freq_cutoff, freqs)
     return preprocessed
@@ -203,20 +207,21 @@ def _find_new_head(token, headlabel):
     return token.head


-def _filter_labels(gold_tuples, cutoff, freqs):
+def _filter_labels(examples, cutoff, freqs):
     # throw away infrequent decorated labels
     # can't learn them reliably anyway and keeps label set smaller
     filtered = []
-    for raw_text, sents in gold_tuples:
-        filtered_sents = []
-        for (ids, words, tags, heads, labels, iob), ctnts in sents:
+    for example in examples:
+        new_example = Example(doc=example.doc)
+        for token_annotation in example.token_annotations:
             filtered_labels = []
-            for label in labels:
+            for label in token_annotation.deps:
                 if is_decorated(label) and freqs.get(label, 0) < cutoff:
                     filtered_labels.append(decompose(label)[0])
                 else:
                     filtered_labels.append(label)
-            filtered_sents.append(
-                ((ids, words, tags, heads, filtered_labels, iob), ctnts))
-        filtered.append((raw_text, filtered_sents))
+            filtered_token_dict = token_annotation.to_dict()
+            filtered_token_dict["deps"] = filtered_labels
+            new_example.add_token_annotation(**filtered_token_dict)
+        filtered.append(new_example)
     return filtered
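The nonproj helpers above no longer rebuild nested annotation tuples; they copy each TokenAnnotation to a dict, patch the changed fields, and attach the result to a fresh Example. A small sketch of that to_dict()/add_token_annotation() round-trip, assuming the Example/TokenAnnotation API used in the hunks; projectivize is passed in so the sketch stays self-contained:

from spacy.gold import Example

def with_projectivized_deps(example, projectivize):
    # Copy-and-patch: Example/TokenAnnotation objects are not edited in place.
    new_example = Example(doc=example.doc)
    for token_annotation in example.token_annotations:
        proj_heads, deco_deps = projectivize(token_annotation.heads,
                                             token_annotation.deps)
        token_dict = token_annotation.to_dict()  # plain dict of the annotation
        token_dict["heads"] = proj_heads         # overwrite only what changed
        token_dict["deps"] = deco_deps
        new_example.add_token_annotation(**token_dict)
    return new_example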
@@ -37,7 +37,7 @@ def _train_parser(parser):
     losses = {}
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
-    parser.update([doc], [gold], sgd=sgd, losses=losses)
+    parser.update((doc, gold), sgd=sgd, losses=losses)
     return parser


@@ -51,7 +51,7 @@ def test_add_label(parser):
     gold = GoldParse(
         doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"]
     )
-    parser.update([doc], [gold], sgd=sgd, losses=losses)
+    parser.update((doc, gold), sgd=sgd, losses=losses)
     doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
     doc = parser(doc)
     assert doc[0].dep_ == "right"

@@ -130,18 +130,25 @@ annot_tuples = [


 def test_get_oracle_actions():
+    ids, words, tags, heads, deps, ents = [], [], [], [], [], []
+    for id_, word, tag, head, dep, ent in annot_tuples:
+        ids.append(id_)
+        words.append(word)
+        tags.append(tag)
+        heads.append(head)
+        deps.append(dep)
+        ents.append(ent)
     doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
     parser = DependencyParser(doc.vocab)
     parser.moves.add_action(0, "")
     parser.moves.add_action(1, "")
     parser.moves.add_action(1, "")
     parser.moves.add_action(4, "ROOT")
-    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
+    for i, (head, dep) in enumerate(zip(heads, deps)):
         if head > i:
             parser.moves.add_action(2, dep)
         elif head < i:
             parser.moves.add_action(3, dep)
-    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
     heads, deps = projectivize(heads, deps)
     gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
     parser.moves.preprocess_gold(gold)

@@ -67,7 +67,7 @@ def test_update_doc(parser, model, doc, gold):
     def optimize(weights, gradient, key=None):
         weights -= 0.001 * gradient

-    parser.update([doc], [gold], sgd=optimize)
+    parser.update((doc, gold), sgd=optimize)


 @pytest.mark.xfail
@@ -83,4 +83,4 @@ def test_update_doc_beam(parser, model, doc, gold):
     def optimize(weights, gradient, key=None):
         weights -= 0.001 * gradient

-    parser.update_beam([doc], [gold], sgd=optimize)
+    parser.update_beam((doc, gold), sgd=optimize)

@@ -30,7 +30,7 @@ def parser(vocab):
     losses = {}
     doc = Doc(vocab, words=["a", "b", "c", "d"])
     gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
-    parser.update([doc], [gold], sgd=sgd, losses=losses)
+    parser.update((doc, gold), sgd=sgd, losses=losses)
     return parser


@@ -24,7 +24,7 @@ def test_simple_train():
         ("bbbbbbbbb", 0.0),
         ("aaaaaa", 1),
     ]:
-        nlp.update([text], [{"cats": {"answer": answer}}])
+        nlp.update((text, {"cats": {"answer": answer}}))
     doc = nlp("aaa")
     assert "answer" in doc.cats
     assert doc.cats["answer"] >= 0.5

@@ -451,7 +451,7 @@ def test_issue999(train_data):
     for itn in range(100):
         random.shuffle(TRAIN_DATA)
         for raw_text, entity_offsets in TRAIN_DATA:
-            nlp.update([raw_text], [{"entities": entity_offsets}])
+            nlp.update((raw_text, {"entities": entity_offsets}))

     with make_tempdir() as model_dir:
         nlp.to_disk(model_dir)

@@ -5,6 +5,8 @@ import pytest
 import gc
 import numpy
 import copy

+from spacy.gold import Example
 from spacy.lang.en import English
 from spacy.lang.en.stop_words import STOP_WORDS
 from spacy.lang.lex_attrs import is_stop
@@ -270,9 +272,9 @@ def test_issue1963(en_tokenizer):
 @pytest.mark.parametrize("label", ["U-JOB-NAME"])
 def test_issue1967(label):
     ner = EntityRecognizer(Vocab())
-    entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
-    gold_parses = [(None, [(entry, None)])]
-    ner.moves.get_actions(gold_parses=gold_parses)
+    example = Example(doc=None)
+    example.add_token_annotation(ids=[0], words=["word"], tags=["tag"], heads=[0], deps=["dep"], entities=[label])
+    ner.moves.get_actions(gold_parses=[example])


 def test_issue1971(en_vocab):

@@ -157,7 +157,7 @@ def test_issue2800():
         losses = {}
         random.shuffle(train_data)
         for statement, entities in train_data:
-            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
+            nlp.update((statement, entities), sgd=optimizer, losses=losses, drop=0.5)


 def test_issue2822(it_tokenizer):

@@ -41,10 +41,8 @@ def test_issue3611():
         batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

         for batch in batches:
-            texts, annotations = zip(*batch)
             nlp.update(
-                docs=texts,
-                golds=annotations,
+                examples=batch,
                 sgd=optimizer,
                 drop=0.1,
                 losses=losses,

@@ -41,10 +41,8 @@ def test_issue4030():
         batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

         for batch in batches:
-            texts, annotations = zip(*batch)
             nlp.update(
-                docs=texts,
-                golds=annotations,
+                examples=batch,
                 sgd=optimizer,
                 drop=0.1,
                 losses=losses,

@@ -19,5 +19,4 @@ def test_issue4348():
         losses = {}
         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
         for batch in batches:
-            texts, annotations = zip(*batch)
-            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
+            nlp.update(batch, sgd=optimizer, losses=losses)
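The test changes above all move from parallel docs/golds lists to a single argument: either one (doc_or_text, gold_or_dict) pair or a whole minibatch of such pairs. A condensed sketch of the call shapes the updated tests exercise; the pipeline setup and training sentence are illustrative only:

from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.get_pipe("ner").add_label("ORG")
optimizer = nlp.begin_training()

annots = {"entities": [(0, 4, "ORG")]}
losses = {}
# a single (text, annotations) pair ...
nlp.update(("Uber blew through $1 million a week", annots),
           sgd=optimizer, losses=losses)
# ... or a minibatch of such pairs, as in test_issue3611 / test_issue4348
nlp.update([("Uber blew through $1 million a week", annots)],
           sgd=optimizer, losses=losses)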
@@ -11,15 +11,14 @@ from spacy.tests.util import make_tempdir
 def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
-        print("temp", tmpdir)
         json_path = tmpdir / "test4402.json"
         srsly.write_json(json_path, json_data)

         corpus = GoldCorpus(str(json_path), str(json_path))

-        train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0))
+        train_data = list(corpus.train_dataset(nlp, gold_preproc=True, max_length=0))
         # assert that the data got split into 4 sentences
-        assert len(train_docs) == 4
+        assert len(train_data) == 4


 json_data = [

@@ -1,11 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
+from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation
 from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
 from spacy.gold import GoldCorpus, docs_to_json, align
 from spacy.lang.en import English
 from spacy.tokens import Doc
+from spacy.util import compounding, minibatch
 from .util import make_tempdir
 import pytest
 import srsly
@@ -119,12 +120,13 @@ def test_roundtrip_docs_to_json():
     with make_tempdir() as tmpdir:
         json_file = tmpdir / "roundtrip.json"
         srsly.write_json(json_file, [docs_to_json(doc)])
-        goldcorpus = GoldCorpus(str(json_file), str(json_file))
+        goldcorpus = GoldCorpus(train=str(json_file), dev=str(json_file))

-    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+    reloaded_example = next(goldcorpus.train_dataset(nlp))
+    goldparse = reloaded_example.gold

     assert len(doc) == goldcorpus.count_train()
-    assert text == reloaded_doc.text
+    assert text == reloaded_example.text
     assert tags == goldparse.tags
     assert deps == goldparse.labels
     assert heads == goldparse.heads
@@ -140,10 +142,11 @@ def test_roundtrip_docs_to_json():
         srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
         goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

-    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+    reloaded_example = next(goldcorpus.train_dataset(nlp))
+    goldparse = reloaded_example.gold

     assert len(doc) == goldcorpus.count_train()
-    assert text == reloaded_doc.text
+    assert text == reloaded_example.text
     assert tags == goldparse.tags
     assert deps == goldparse.labels
     assert heads == goldparse.heads
@@ -160,13 +163,14 @@ def test_roundtrip_docs_to_json():
         srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
         goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
         # load and rewrite as JSONL tuples
-        srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
+        srsly.write_jsonl(jsonl_file, goldcorpus.train_examples)
         goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))

-    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+    reloaded_example = next(goldcorpus.train_dataset(nlp))
+    goldparse = reloaded_example.gold

     assert len(doc) == goldcorpus.count_train()
-    assert text == reloaded_doc.text
+    assert text == reloaded_example.text
     assert tags == goldparse.tags
     assert deps == goldparse.labels
     assert heads == goldparse.heads
@@ -217,3 +221,144 @@ def test_goldparse_startswith_space(en_tokenizer):
     assert g.words == [" ", "a"]
     assert g.ner == [None, "U-DATE"]
     assert g.labels == [None, "ROOT"]
+
+
+def test_gold_constructor():
+    """Test that the GoldParse constructor works fine"""
+    nlp = English()
+    doc = nlp("This is a sentence")
+    gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
+
+    assert gold.cats["cat1"]
+    assert not gold.cats["cat2"]
+    assert gold.words == ["This", "is", "a", "sentence"]
+
+
+def test_gold_orig_annot():
+    nlp = English()
+    doc = nlp("This is a sentence")
+    gold = GoldParse(doc, cats={"cat1": 1.0, "cat2": 0.0})
+
+    assert gold.orig.words == ["This", "is", "a", "sentence"]
+    assert gold.cats["cat1"]
+
+    doc_annotation = DocAnnotation(cats={"cat1": 0.0, "cat2": 1.0})
+    gold2 = GoldParse.from_annotation(doc, doc_annotation, gold.orig)
+    assert gold2.orig.words == ["This", "is", "a", "sentence"]
+    assert not gold2.cats["cat1"]
+
+
+def test_tuple_format_implicit():
+    """Test tuple format with implicit GoldParse creation"""
+
+    train_data = [
+        ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
+        (
+            "Spotify steps up Asia expansion",
+            {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+        ),
+        ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
+    ]
+
+    _train(train_data)
+
+
+def test_tuple_format_implicit_invalid():
+    """Test that an error is thrown for an implicit invalid GoldParse field"""
+
+    train_data = [
+        ("Uber blew through $1 million a week", {"frumble": [(0, 4, "ORG")]}),
+        (
+            "Spotify steps up Asia expansion",
+            {"entities": [(0, 8, "ORG"), (17, 21, "LOC")]},
+        ),
+        ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
+    ]
+
+    with pytest.raises(TypeError):
+        _train(train_data)
+
+
+def _train(train_data):
+    nlp = English()
+    ner = nlp.create_pipe("ner")
+    ner.add_label("ORG")
+    ner.add_label("LOC")
+    nlp.add_pipe(ner)
+
+    optimizer = nlp.begin_training()
+    for i in range(5):
+        losses = {}
+        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+        for batch in batches:
+            nlp.update(batch, sgd=optimizer, losses=losses)
+
+
+tokens_1 = {
+    "ids": [1, 2, 3],
+    "words": ["Hi", "there", "everyone"],
+    "tags": ["INTJ", "ADV", "PRON"],
+}
+
+tokens_2 = {
+    "ids": [1, 2, 3, 4],
+    "words": ["It", "is", "just", "me"],
+    "tags": ["PRON", "AUX", "ADV", "PRON"],
+}
+
+text0 = "Hi there everyone It is just me"
+
+
+def test_merge_sents():
+    nlp = English()
+    example = Example()
+    example.add_token_annotation(**tokens_1)
+    example.add_token_annotation(**tokens_2)
+    assert len(example.get_gold_parses(merge=False, vocab=nlp.vocab)) == 2
+    assert len(example.get_gold_parses(merge=True, vocab=nlp.vocab)) == 1  # this shouldn't change the original object
+
+    merged_example = example.merge_sents()
+
+    token_annotation_1 = example.token_annotations[0]
+    assert token_annotation_1.ids == [1, 2, 3]
+    assert token_annotation_1.words == ["Hi", "there", "everyone"]
+    assert token_annotation_1.tags == ["INTJ", "ADV", "PRON"]
+
+    token_annotation_m = merged_example.token_annotations[0]
+    assert token_annotation_m.ids == [1, 2, 3, 4, 5, 6, 7]
+    assert token_annotation_m.words == ["Hi", "there", "everyone", "It", "is", "just", "me"]
+    assert token_annotation_m.tags == ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"]
+
+
+def test_tuples_to_example():
+    ex = Example()
+    ex.add_token_annotation(**tokens_1)
+    ex.add_token_annotation(**tokens_2)
+    ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})
+    ex_dict = ex.to_dict()
+
+    token_dicts = [
+        {
+            "ids": [1, 2, 3],
+            "words": ["Hi", "there", "everyone"],
+            "tags": ["INTJ", "ADV", "PRON"],
+            "heads": [],
+            "deps": [],
+            "entities": [],
+            "morphology": [],
+            "brackets": [],
+        },
+        {
+            "ids": [1, 2, 3, 4],
+            "words": ["It", "is", "just", "me"],
+            "tags": ["PRON", "AUX", "ADV", "PRON"],
+            "heads": [],
+            "deps": [],
+            "entities": [],
+            "morphology": [],
+            "brackets": [],
+        },
+    ]
+    doc_dict = {"cats": {"TRAVEL": 1.0, "BAKING": 0.0}, "links": {}}
+
+    assert ex_dict == {"token_annotations": token_dicts, "doc_annotation": doc_dict}
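The new gold tests above double as documentation for the Example container. A condensed restatement of the pattern from test_tuples_to_example: token-level and doc-level annotations are added separately and serialized with to_dict():

from spacy.gold import Example

ex = Example()
ex.add_token_annotation(ids=[1, 2, 3], words=["Hi", "there", "everyone"],
                        tags=["INTJ", "ADV", "PRON"])
ex.add_doc_annotation(cats={"TRAVEL": 1.0, "BAKING": 0.0})

ex_dict = ex.to_dict()
# -> {"token_annotations": [...], "doc_annotation": {"cats": {...}, "links": {}}}
print(sorted(ex_dict))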
@@ -31,20 +31,20 @@ def test_language_update(nlp):
     doc = Doc(nlp.vocab, words=text.split(" "))
     gold = GoldParse(doc, **annots)
     # Update with doc and gold objects
-    nlp.update([doc], [gold])
+    nlp.update((doc, gold))
     # Update with text and dict
-    nlp.update([text], [annots])
+    nlp.update((text, annots))
     # Update with doc object and dict
-    nlp.update([doc], [annots])
+    nlp.update((doc, annots))
     # Update with text and gold object
-    nlp.update([text], [gold])
+    nlp.update((text, gold))
+    # Update with empty doc and gold object
+    nlp.update((None, gold))
     # Update badly
-    with pytest.raises(IndexError):
-        nlp.update([doc], [])
-    with pytest.raises(IndexError):
-        nlp.update([], [gold])
     with pytest.raises(ValueError):
-        nlp.update([text], [wrongkeyannots])
+        nlp.update((doc, None))
+    with pytest.raises(TypeError):
+        nlp.update((text, wrongkeyannots))


 def test_language_evaluate(nlp):

@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from numpy.testing import assert_almost_equal, assert_array_almost_equal
 import pytest
 from pytest import approx
-from spacy.gold import GoldParse
+from spacy.gold import Example, GoldParse
 from spacy.scorer import Scorer, ROCAUCScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from .util import get_doc
@@ -40,7 +40,7 @@ def test_las_per_type(en_vocab):
             deps=annot["deps"],
         )
         gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
-        scorer.score(doc, gold)
+        scorer.score((doc, gold))
     results = scorer.scores

     assert results["uas"] == 100
@@ -63,7 +63,7 @@ def test_las_per_type(en_vocab):
         )
         gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
         doc[0].dep_ = "compound"
-        scorer.score(doc, gold)
+        scorer.score((doc, gold))
     results = scorer.scores

     assert results["uas"] == 100
@@ -85,8 +85,9 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "CARDINAL"], [2, 3, "CARDINAL"]],
         )
-        gold = GoldParse(doc, entities=annot["entities"])
-        scorer.score(doc, gold)
+        ex = Example(doc=doc)
+        ex.add_token_annotation(entities=annot["entities"])
+        scorer.score(ex)
     results = scorer.scores

     assert results["ents_p"] == 100
@@ -105,8 +106,9 @@ def test_ner_per_type(en_vocab):
             words=input_.split(" "),
             ents=[[0, 1, "ORG"], [5, 6, "GPE"], [6, 7, "ORG"]],
         )
-        gold = GoldParse(doc, entities=annot["entities"])
-        scorer.score(doc, gold)
+        ex = Example(doc=doc)
+        ex.add_token_annotation(entities=annot["entities"])
+        scorer.score(ex)
     results = scorer.scores

     assert results["ents_p"] == approx(66.66666)
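With the scorer hunks above, Scorer.score() takes a single Example-like argument, either an Example or a (doc, gold) tuple, instead of separate doc and gold parameters. A minimal sketch of the tuple form mirroring the updated tests; the document and entity here are made up for illustration:

from spacy.gold import GoldParse
from spacy.lang.en import English
from spacy.scorer import Scorer

nlp = English()
doc = nlp("London is big")
gold = GoldParse(doc, entities=[(0, 6, "GPE")])

scorer = Scorer()
scorer.score((doc, gold))          # was: scorer.score(doc, gold)
print(scorer.scores["ents_p"])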
@@ -158,7 +158,7 @@ cdef class Tokenizer:
         doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc

-    def pipe(self, texts, batch_size=1000, n_threads=-1):
+    def pipe(self, texts, batch_size=1000, n_threads=-1, as_example=False):
         """Tokenize a stream of texts.

         texts: A sequence of unicode texts.

@@ -616,31 +616,25 @@ def decaying(start, stop, decay):
         curr -= decay


-def minibatch_by_words(items, size, tuples=True, count_words=len):
+def minibatch_by_words(examples, size, tuples=True, count_words=len):
     """Create minibatches of a given number of words."""
     if isinstance(size, int):
         size_ = itertools.repeat(size)
     else:
         size_ = size
-    items = iter(items)
+    examples = iter(examples)
     while True:
         batch_size = next(size_)
         batch = []
         while batch_size >= 0:
             try:
-                if tuples:
-                    doc, gold = next(items)
-                else:
-                    doc = next(items)
+                example = next(examples)
             except StopIteration:
                 if batch:
                     yield batch
                 return
-            batch_size -= count_words(doc)
-            if tuples:
-                batch.append((doc, gold))
-            else:
-                batch.append(doc)
+            batch_size -= count_words(example.doc)
+            batch.append(example)
         if batch:
             yield batch

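After the util hunk above, minibatch_by_words() counts words via example.doc, so the stream it consumes should yield Example objects rather than (doc, gold) tuples. A small usage sketch under that assumption; the texts are illustrative:

from spacy.gold import Example
from spacy.lang.en import English
from spacy.util import minibatch_by_words

nlp = English()
texts = ["A short one", "Another short example sentence"]
# minibatch_by_words() only touches example.doc, so a bare Example around a
# tokenized Doc is enough for sizing the batches
examples = [Example(doc=nlp.tokenizer(text)) for text in texts]

for batch in minibatch_by_words(examples, size=8):
    print([ex.doc.text for ex in batch])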