mirror of https://github.com/explosion/spaCy.git (synced 2025-02-06 14:40:34 +03:00)

Remove the 'pass example into __call__' thing

This commit is contained in:
parent b3868cd1f8
commit 0714f1fa5c
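As a reading aid for the diff below (not part of the commit), here is a minimal, self-contained sketch of the convention this change moves towards: __call__ and pipe() deal only in Doc objects, while training methods such as update() receive Example objects that pair a predicted doc with a reference (gold) doc. All class and attribute names in the sketch are illustrative stand-ins, not the actual spaCy implementation.

# Minimal sketch (assumed names, not the real spaCy classes) of the calling
# convention this commit moves towards.

class ToyExample:
    def __init__(self, predicted, reference):
        self.predicted = predicted   # the doc the model sees (also exposed as .x)
        self.reference = reference   # the gold-standard doc (also exposed as .y)
        self.x = predicted
        self.y = reference


class ToyPipe:
    def __call__(self, doc):
        # Apply the component to a single doc and return that same doc.
        self.set_annotations([doc], self.predict([doc]))
        return doc

    def pipe(self, docs, batch_size=128):
        # Apply the component to a stream of docs and yield docs back,
        # batching for efficiency.
        batch = []
        for doc in docs:
            batch.append(doc)
            if len(batch) == batch_size:
                yield from self._process(batch)
                batch = []
        if batch:
            yield from self._process(batch)

    def _process(self, batch):
        self.set_annotations(batch, self.predict(batch))
        yield from batch

    def update(self, examples):
        # Training reads the model input from eg.predicted and the gold
        # annotations from eg.reference.
        docs = [eg.predicted for eg in examples]
        scores = self.predict(docs)
        # ... compute a loss against [eg.reference for eg in examples] ...
        return scores

    def predict(self, docs):
        return [None] * len(docs)    # placeholder predictions

    def set_annotations(self, docs, scores):
        pass                         # placeholder annotation logic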
@@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter
 from .functions import merge_subtokens
 from ..language import Language, component
 from ..syntax import nonproj
-from ..gold import Example
+from ..gold.new_example import NewExample as Example
 from ..attrs import POS, ID
 from ..util import link_vectors_to_models, create_default_optimizer
 from ..parts_of_speech import X
@@ -48,12 +48,6 @@ class Pipe(object):
     def from_nlp(cls, nlp, model, **cfg):
         return cls(nlp.vocab, model, **cfg)
 
-    def _get_doc(self, example):
-        """ Use this method if the `example` can be both a Doc or an Example """
-        if isinstance(example, Doc):
-            return example
-        return example.doc
-
     def __init__(self, vocab, model, **cfg):
         """Create a new pipe instance."""
         raise NotImplementedError
@@ -73,18 +67,17 @@ class Pipe(object):
         else:
             self.set_annotations([doc], predictions)
         if isinstance(example, Example):
-            example.doc = doc
+            example.predicted = doc
             return example
         return doc
 
-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
+    def pipe(self, stream, batch_size=128, n_threads=-1):
         """Apply the pipe to a stream of documents.
 
         Both __call__ and pipe should delegate to the `predict()`
         and `set_annotations()` methods.
         """
         for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
@@ -94,7 +87,7 @@ class Pipe(object):
 
             if as_example:
                 for ex, doc in zip(examples, docs):
-                    ex.doc = doc
+                    ex.predicted = doc
                     yield ex
             else:
                 yield from docs
@@ -116,7 +109,6 @@ class Pipe(object):
         Delegates to predict() and get_loss().
         """
         if set_annotations:
-            docs = (self._get_doc(ex) for ex in examples)
             docs = list(self.pipe(docs))
 
     def rehearse(self, examples, sgd=None, losses=None, **config):
@@ -256,27 +248,17 @@ class Tagger(Pipe):
         return tuple(self.vocab.morphology.tag_names)
 
     def __call__(self, example):
-        doc = self._get_doc(example)
         tags = self.predict([doc])
         self.set_annotations([doc], tags)
         if isinstance(example, Example):
-            example.doc = doc
+            example.predicted = doc
             return example
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             tag_ids = self.predict(docs)
-            assert len(docs) == len(examples)
-            assert len(tag_ids) == len(examples)
             self.set_annotations(docs, tag_ids)
-
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs
 
     def predict(self, docs):
@@ -327,15 +309,17 @@ class Tagger(Pipe):
             doc.is_tagged = True
 
     def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
 
-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
             return
         set_dropout_rate(self.model, drop)
-        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples])
+        tag_scores, bp_tag_scores = self.model.begin_update(
+            [eg.predicted for eg in examples])
         for sc in tag_scores:
             if self.model.ops.xp.isnan(sc.sum()):
                 raise ValueError("nan value in scores")
@@ -347,17 +331,16 @@ class Tagger(Pipe):
         if losses is not None:
             losses[self.name] += loss
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, self._scores2guesses(tag_scores))
 
     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         """Perform a 'rehearsal' update, where we try to match the output of
         an initial model.
         """
+        docs = [eg.predicted for eg in examples]
         if self._rehearsal_model is None:
             return
-        examples = Example.to_example_objects(examples)
-        docs = [ex.doc for ex in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return
@@ -387,7 +370,8 @@ class Tagger(Pipe):
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
-            for tag in example.token_annotation.tags:
+            for token in example.y:
+                tag = token.tag_
                 if tag in orig_tag_map:
                     new_tag_map[tag] = orig_tag_map[tag]
                 else:
@@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger):
         d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
         d_scores *= self.model.ops.asarray(known_labels)
         loss = (d_scores**2).sum()
-        docs = [eg.doc for eg in examples]
+        docs = [eg.predicted for eg in examples]
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores
 
@@ -687,8 +671,8 @@ class MultitaskObjective(Tagger):
         gold_examples = nonproj.preprocess_training_data(get_examples())
         # for raw_text, doc_annot in gold_tuples:
         for example in gold_examples:
-            for i in range(len(example.token_annotation.ids)):
-                label = self.make_label(i, example.token_annotation)
+            for token in example.y:
+                label = self.make_label(token)
                 if label is not None and label not in self.labels:
                     self.labels[label] = len(self.labels)
         self.model.initialize()
@@ -706,11 +690,11 @@ class MultitaskObjective(Tagger):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype="i")
         guesses = scores.argmax(axis=1)
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         for i, eg in enumerate(examples):
             # Handles alignment for tokenization differences
             doc_annots = eg.get_aligned()
-            for j in range(len(eg.doc)):
+            for j in range(len(eg.predicted)):
                 tok_annots = {key: values[j] for key, values in tok_annots.items()}
                 label = self.make_label(j, tok_annots)
                 if label is None or label not in self.labels:
@@ -724,83 +708,49 @@ class MultitaskObjective(Tagger):
         return float(loss), d_scores
 
     @staticmethod
-    def make_dep(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        return token_annotation.deps[i]
+    def make_dep(token):
+        return token.dep_
 
     @staticmethod
-    def make_tag(i, token_annotation):
-        return token_annotation.tags[i]
+    def make_tag(token):
+        return token.tag_
 
     @staticmethod
-    def make_ent(i, token_annotation):
-        if token_annotation.entities is None:
-            return None
-        return token_annotation.entities[i]
+    def make_ent(token):
+        if token.ent_iob_ == "O":
+            return "O"
+        else:
+            return token.ent_iob_ + "-" + token.ent_type_
 
     @staticmethod
-    def make_dep_tag_offset(i, token_annotation):
-        if token_annotation.deps[i] is None or token_annotation.heads[i] is None:
-            return None
-        offset = token_annotation.heads[i] - i
+    def make_dep_tag_offset(token):
+        dep = token.dep_
+        tag = token.tag_
+        offset = token.head.i - token.i
         offset = min(offset, 2)
         offset = max(offset, -2)
-        return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}"
+        return f"{dep}-{tag}:{offset}"
 
     @staticmethod
-    def make_ent_tag(i, token_annotation):
-        if token_annotation.entities is None or token_annotation.entities[i] is None:
-            return None
+    def make_ent_tag(token):
+        if token.ent_iob_ == "O":
+            ent = "O"
         else:
-            return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}"
+            ent = token.ent_iob_ + "-" + token.ent_type_
+        tag = token.tag_
+        return f"{tag}-{ent}"
 
     @staticmethod
-    def make_sent_start(target, token_annotation, cache=True, _cache={}):
+    def make_sent_start(token):
         """A multi-task objective for representing sentence boundaries,
         using BILU scheme. (O is impossible)
-
-        The implementation of this method uses an internal cache that relies
-        on the identity of the heads array, to avoid requiring a new piece
-        of gold data. You can pass cache=False if you know the cache will
-        do the wrong thing.
         """
-        words = token_annotation.words
-        heads = token_annotation.heads
-        assert len(words) == len(heads)
-        assert target < len(words), (target, len(words))
-        if cache:
-            if id(heads) in _cache:
-                return _cache[id(heads)][target]
-            else:
-                for key in list(_cache.keys()):
-                    _cache.pop(key)
-                sent_tags = ["I-SENT"] * len(words)
-                _cache[id(heads)] = sent_tags
-        else:
-            sent_tags = ["I-SENT"] * len(words)
-
-        def _find_root(child):
-            seen = set([child])
-            while child is not None and heads[child] != child:
-                seen.add(child)
-                child = heads[child]
-            return child
-
-        sentences = {}
-        for i in range(len(words)):
-            root = _find_root(i)
-            if root is None:
-                sent_tags[i] = None
-            else:
-                sentences.setdefault(root, []).append(i)
-        for root, span in sorted(sentences.items()):
-            if len(span) == 1:
-                sent_tags[span[0]] = "U-SENT"
-            else:
-                sent_tags[span[0]] = "B-SENT"
-                sent_tags[span[-1]] = "L-SENT"
-        return sent_tags[target]
+        if token.is_sent_start and token.is_sent_end:
+            return "U-SENT"
+        elif token.is_sent_start:
+            return "B-SENT"
+        else:
+            return "I-SENT"
 
 
 class ClozeMultitask(Pipe):
@@ -833,7 +783,7 @@ class ClozeMultitask(Pipe):
         # token.vector values, but that's a bit inefficient, especially on GPU.
         # Instead we fetch the index into the vectors table for each of our tokens,
         # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples])
+        ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples])
         target = vectors[ids]
         gradient = self.distance.get_grad(prediction, target)
         loss = self.distance.get_loss(prediction, target)
@@ -843,11 +793,12 @@ class ClozeMultitask(Pipe):
         pass
 
     def rehearse(self, examples, drop=0., sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
+        docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
-        predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples])
+        predictions, bp_predictions = self.model.begin_update(
+            [eg.predicted for eg in examples])
         loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
         bp_predictions(d_predictions)
         if sgd is not None:
@@ -883,16 +834,9 @@ class TextCategorizer(Pipe):
         self.cfg["labels"] = tuple(value)
 
     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
             scores, tensors = self.predict(docs)
             self.set_annotations(docs, scores, tensors=tensors)
-
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs
 
     def predict(self, docs):
@@ -914,12 +858,15 @@ class TextCategorizer(Pipe):
                 doc.cats[label] = float(scores[i, j])
 
     def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
-        examples = Example.to_example_objects(examples)
-        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+        for eg in examples:
+            assert isinstance(eg, Example)
+        if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
             # Handle cases where there are no tokens in any docs.
             return
         set_dropout_rate(self.model, drop)
-        scores, bp_scores = self.model.begin_update([ex.doc for ex in examples])
+        scores, bp_scores = self.model.begin_update(
+            [eg.predicted for eg in examples]
+        )
         loss, d_scores = self.get_loss(examples, scores)
         bp_scores(d_scores)
         if sgd is not None:
@@ -928,14 +875,15 @@ class TextCategorizer(Pipe):
             losses.setdefault(self.name, 0.0)
             losses[self.name] += loss
         if set_annotations:
-            docs = [ex.doc for ex in examples]
+            docs = [eg.predicted for eg in examples]
             self.set_annotations(docs, scores=scores)
 
     def rehearse(self, examples, drop=0., sgd=None, losses=None):
         if self._rehearsal_model is None:
             return
-        examples = Example.to_example_objects(examples)
-        docs=[ex.doc for ex in examples]
+        for eg in examples:
+            assert isinstance(eg, Example)
+        docs = [eg.predicted for eg in examples]
         if not any(len(doc) for doc in docs):
             # Handle cases where there are no tokens in any docs.
             return
@@ -955,8 +903,8 @@ class TextCategorizer(Pipe):
         not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f")
         for i, eg in enumerate(examples):
             for j, label in enumerate(self.labels):
-                if label in eg.doc_annotation.cats:
-                    truths[i, j] = eg.doc_annotation.cats[label]
+                if label in eg.predicted.cats:
+                    truths[i, j] = eg.reference.cats[label]
                 else:
                     not_missing[i, j] = 0.
         truths = self.model.ops.asarray(truths)
@@ -993,7 +941,7 @@ class TextCategorizer(Pipe):
         # TODO: begin_training is not guaranteed to see all data / labels ?
         examples = list(get_examples())
         for example in examples:
-            for cat in example.doc_annotation.cats:
+            for cat in example.y.cats:
                 self.add_label(cat)
         self.require_labels()
         docs = [Doc(Vocab(), words=["hello"])]
@@ -1152,21 +1100,22 @@ class EntityLinker(Pipe):
             losses.setdefault(self.name, 0.0)
         if not examples:
             return 0
-        examples = Example.to_example_objects(examples)
+        for eg in examples:
+            assert isinstance(eg, Example)
         sentence_docs = []
-        docs = [ex.doc for ex in examples]
+        docs = [eg.predicted for eg in examples]
         if set_annotations:
             # This seems simpler than other ways to get that exact output -- but
             # it does run the model twice :(
             predictions = self.model.predict(docs)
 
         for eg in examples:
-            doc = eg.doc
+            doc = eg.predicted
             ents_by_offset = dict()
             for ent in doc.ents:
                 ents_by_offset[(ent.start_char, ent.end_char)] = ent
-            for entity, kb_dict in eg.doc_annotation.links.items():
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 if isinstance(entity, str):
                     entity = literal_eval(entity)
                 start, end = entity
@@ -1204,7 +1153,8 @@ class EntityLinker(Pipe):
     def get_similarity_loss(self, examples, scores):
         entity_encodings = []
         for eg in examples:
-            for entity, kb_dict in eg.doc_annotation.links.items():
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 for kb_id, value in kb_dict.items():
                     # this loss function assumes we're only using positive examples
                     if value:
@@ -1223,8 +1173,9 @@ class EntityLinker(Pipe):
 
     def get_loss(self, examples, scores):
         cats = []
-        for ex in examples:
-            for entity, kb_dict in ex.doc_annotation.links.items():
+        for eg in examples:
+            links = self._get_links_from_doc(eg.reference)
+            for entity, kb_dict in links.items():
                 for kb_id, value in kb_dict.items():
                     cats.append([value])
 
@@ -1237,26 +1188,21 @@ class EntityLinker(Pipe):
         loss = loss / len(cats)
         return loss, d_scores
 
-    def __call__(self, example):
-        doc = self._get_doc(example)
+    def _get_links_from_doc(self, doc):
+        return {}
 
+    def __call__(self, doc):
         kb_ids, tensors = self.predict([doc])
         self.set_annotations([doc], kb_ids, tensors=tensors)
         if isinstance(example, Example):
-            example.doc = doc
+            example.x = doc
             return example
         return doc
 
     def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+        for docs in util.minibatch(stream, size=batch_size):
            kb_ids, tensors = self.predict(docs)
            self.set_annotations(docs, kb_ids, tensors=tensors)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs
 
     def predict(self, docs):
@@ -1433,7 +1379,7 @@ class Sentencizer(Pipe):
     ):
         pass
 
-    def __call__(self, example):
+    def __call__(self, doc):
         """Apply the sentencizer to a Doc and set Token.is_sent_start.
 
         example (Doc or Example): The document to process.
@@ -1441,7 +1387,6 @@ class Sentencizer(Pipe):
 
         DOCS: https://spacy.io/api/sentencizer#call
         """
-        doc = self._get_doc(example)
         start = 0
         seen_period = False
         for i, token in enumerate(doc):
@@ -1460,20 +1405,14 @@ class Sentencizer(Pipe):
             return example
         return doc
 
-    def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False):
-        for examples in util.minibatch(stream, size=batch_size):
-            docs = [self._get_doc(ex) for ex in examples]
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for docs in util.minibatch(stream, size=batch_size):
             predictions = self.predict(docs)
             if isinstance(predictions, tuple) and len(tuple) == 2:
                 scores, tensors = predictions
                 self.set_annotations(docs, scores, tensors=tensors)
             else:
                 self.set_annotations(docs, predictions)
-            if as_example:
-                for ex, doc in zip(examples, docs):
-                    ex.doc = doc
-                    yield ex
-            else:
-                yield from docs
+            yield from docs
 
     def predict(self, docs):