diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2c40738f6..c6233be90 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -20,7 +20,7 @@ from .defaults import default_nel, default_senter from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj -from ..gold import Example +from ..gold.new_example import NewExample as Example from ..attrs import POS, ID from ..util import link_vectors_to_models, create_default_optimizer from ..parts_of_speech import X @@ -48,12 +48,6 @@ class Pipe(object): def from_nlp(cls, nlp, model, **cfg): return cls(nlp.vocab, model, **cfg) - def _get_doc(self, example): - """ Use this method if the `example` can be both a Doc or an Example """ - if isinstance(example, Doc): - return example - return example.doc - def __init__(self, vocab, model, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -73,18 +67,17 @@ class Pipe(object): else: self.set_annotations([doc], predictions) if isinstance(example, Example): - example.doc = doc + example.predicted = doc return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): + def pipe(self, stream, batch_size=128, n_threads=-1): """Apply the pipe to a stream of documents. Both __call__ and pipe should delegate to the `predict()` and `set_annotations()` methods. """ for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions @@ -94,7 +87,7 @@ class Pipe(object): if as_example: for ex, doc in zip(examples, docs): - ex.doc = doc + ex.predicted = doc yield ex else: yield from docs @@ -116,7 +109,6 @@ class Pipe(object): Delegates to predict() and get_loss(). """ if set_annotations: - docs = (self._get_doc(ex) for ex in examples) docs = list(self.pipe(docs)) def rehearse(self, examples, sgd=None, losses=None, **config): @@ -256,28 +248,18 @@ class Tagger(Pipe): return tuple(self.vocab.morphology.tag_names) def __call__(self, example): - doc = self._get_doc(example) tags = self.predict([doc]) self.set_annotations([doc], tags) if isinstance(example, Example): - example.doc = doc + example.predicted = doc return example return doc def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): tag_ids = self.predict(docs) - assert len(docs) == len(examples) - assert len(tag_ids) == len(examples) self.set_annotations(docs, tag_ids) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): if not any(len(doc) for doc in docs): @@ -327,15 +309,17 @@ class Tagger(Pipe): doc.is_tagged = True def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False): - examples = Example.to_example_objects(examples) + for eg in examples: + assert isinstance(eg, Example) if losses is not None and self.name not in losses: losses[self.name] = 0. - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return set_dropout_rate(self.model, drop) - tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples]) + tag_scores, bp_tag_scores = self.model.begin_update( + [eg.predicted for eg in examples]) for sc in tag_scores: if self.model.ops.xp.isnan(sc.sum()): raise ValueError("nan value in scores") @@ -347,17 +331,16 @@ class Tagger(Pipe): if losses is not None: losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, self._scores2guesses(tag_scores)) def rehearse(self, examples, drop=0., sgd=None, losses=None): """Perform a 'rehearsal' update, where we try to match the output of an initial model. """ + docs = [eg.predicted for eg in examples] if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs = [ex.doc for ex in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return @@ -387,7 +370,8 @@ class Tagger(Pipe): orig_tag_map = dict(self.vocab.morphology.tag_map) new_tag_map = {} for example in get_examples(): - for tag in example.token_annotation.tags: + for token in example.y: + tag = token.tag_ if tag in orig_tag_map: new_tag_map[tag] = orig_tag_map[tag] else: @@ -575,7 +559,7 @@ class SentenceRecognizer(Tagger): d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) d_scores *= self.model.ops.asarray(known_labels) loss = (d_scores**2).sum() - docs = [eg.doc for eg in examples] + docs = [eg.predicted for eg in examples] d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) return float(loss), d_scores @@ -687,8 +671,8 @@ class MultitaskObjective(Tagger): gold_examples = nonproj.preprocess_training_data(get_examples()) # for raw_text, doc_annot in gold_tuples: for example in gold_examples: - for i in range(len(example.token_annotation.ids)): - label = self.make_label(i, example.token_annotation) + for token in example.y: + label = self.make_label(token) if label is not None and label not in self.labels: self.labels[label] = len(self.labels) self.model.initialize() @@ -706,11 +690,11 @@ class MultitaskObjective(Tagger): cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype="i") guesses = scores.argmax(axis=1) - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] for i, eg in enumerate(examples): # Handles alignment for tokenization differences doc_annots = eg.get_aligned() - for j in range(len(eg.doc)): + for j in range(len(eg.predicted)): tok_annots = {key: values[j] for key, values in tok_annots.items()} label = self.make_label(j, tok_annots) if label is None or label not in self.labels: @@ -724,83 +708,49 @@ class MultitaskObjective(Tagger): return float(loss), d_scores @staticmethod - def make_dep(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return None - return token_annotation.deps[i] + def make_dep(token): + return token.dep_ @staticmethod - def make_tag(i, token_annotation): - return token_annotation.tags[i] + def make_tag(token): + return token.tag_ @staticmethod - def make_ent(i, token_annotation): - if token_annotation.entities is None: - return None - return token_annotation.entities[i] + def make_ent(token): + if token.ent_iob_ == "O": + return "O" + else: + return token.ent_iob_ + "-" + token.ent_type_ @staticmethod - def make_dep_tag_offset(i, token_annotation): - if token_annotation.deps[i] is None or token_annotation.heads[i] is None: - return None - offset = token_annotation.heads[i] - i + def make_dep_tag_offset(token): + dep = token.dep_ + tag = token.tag_ + offset = token.head.i - token.i offset = min(offset, 2) offset = max(offset, -2) - return f"{token_annotation.deps[i]}-{token_annotation.tags[i]}:{offset}" + return f"{dep}-{tag}:{offset}" @staticmethod - def make_ent_tag(i, token_annotation): - if token_annotation.entities is None or token_annotation.entities[i] is None: - return None + def make_ent_tag(token): + if token.ent_iob_ == "O": + ent = "O" else: - return f"{token_annotation.tags[i]}-{token_annotation.entities[i]}" + ent = token.ent_iob_ + "-" + token.ent_type_ + tag = token.tag_ + return f"{tag}-{ent}" @staticmethod - def make_sent_start(target, token_annotation, cache=True, _cache={}): + def make_sent_start(token): """A multi-task objective for representing sentence boundaries, using BILU scheme. (O is impossible) - - The implementation of this method uses an internal cache that relies - on the identity of the heads array, to avoid requiring a new piece - of gold data. You can pass cache=False if you know the cache will - do the wrong thing. """ - words = token_annotation.words - heads = token_annotation.heads - assert len(words) == len(heads) - assert target < len(words), (target, len(words)) - if cache: - if id(heads) in _cache: - return _cache[id(heads)][target] - else: - for key in list(_cache.keys()): - _cache.pop(key) - sent_tags = ["I-SENT"] * len(words) - _cache[id(heads)] = sent_tags + if token.is_sent_start and token.is_sent_end: + return "U-SENT" + elif token.is_sent_start: + return "B-SENT" else: - sent_tags = ["I-SENT"] * len(words) - - def _find_root(child): - seen = set([child]) - while child is not None and heads[child] != child: - seen.add(child) - child = heads[child] - return child - - sentences = {} - for i in range(len(words)): - root = _find_root(i) - if root is None: - sent_tags[i] = None - else: - sentences.setdefault(root, []).append(i) - for root, span in sorted(sentences.items()): - if len(span) == 1: - sent_tags[span[0]] = "U-SENT" - else: - sent_tags[span[0]] = "B-SENT" - sent_tags[span[-1]] = "L-SENT" - return sent_tags[target] + return "I-SENT" class ClozeMultitask(Pipe): @@ -833,7 +783,7 @@ class ClozeMultitask(Pipe): # token.vector values, but that's a bit inefficient, especially on GPU. # Instead we fetch the index into the vectors table for each of our tokens, # and look them up all at once. This prevents data copying. - ids = self.model.ops.flatten([ex.doc.to_array(ID).ravel() for ex in examples]) + ids = self.model.ops.flatten([eg.predicted.to_array(ID).ravel() for eg in examples]) target = vectors[ids] gradient = self.distance.get_grad(prediction, target) loss = self.distance.get_loss(prediction, target) @@ -843,11 +793,12 @@ class ClozeMultitask(Pipe): pass def rehearse(self, examples, drop=0., sgd=None, losses=None): - examples = Example.to_example_objects(examples) if losses is not None and self.name not in losses: losses[self.name] = 0. + docs = [eg.predicted for eg in examples] set_dropout_rate(self.model, drop) - predictions, bp_predictions = self.model.begin_update([ex.doc for ex in examples]) + predictions, bp_predictions = self.model.begin_update( + [eg.predicted for eg in examples]) loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: @@ -883,17 +834,10 @@ class TextCategorizer(Pipe): self.cfg["labels"] = tuple(value) def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): scores, tensors = self.predict(docs) self.set_annotations(docs, scores, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): tensors = [doc.tensor for doc in docs] @@ -914,12 +858,15 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None): - examples = Example.to_example_objects(examples) - if not any(len(ex.doc) if ex.doc else 0 for ex in examples): + for eg in examples: + assert isinstance(eg, Example) + if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples): # Handle cases where there are no tokens in any docs. return set_dropout_rate(self.model, drop) - scores, bp_scores = self.model.begin_update([ex.doc for ex in examples]) + scores, bp_scores = self.model.begin_update( + [eg.predicted for eg in examples] + ) loss, d_scores = self.get_loss(examples, scores) bp_scores(d_scores) if sgd is not None: @@ -928,14 +875,15 @@ class TextCategorizer(Pipe): losses.setdefault(self.name, 0.0) losses[self.name] += loss if set_annotations: - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] self.set_annotations(docs, scores=scores) def rehearse(self, examples, drop=0., sgd=None, losses=None): if self._rehearsal_model is None: return - examples = Example.to_example_objects(examples) - docs=[ex.doc for ex in examples] + for eg in examples: + assert isinstance(eg, Example) + docs = [eg.predicted for eg in examples] if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. return @@ -955,8 +903,8 @@ class TextCategorizer(Pipe): not_missing = numpy.ones((len(examples), len(self.labels)), dtype="f") for i, eg in enumerate(examples): for j, label in enumerate(self.labels): - if label in eg.doc_annotation.cats: - truths[i, j] = eg.doc_annotation.cats[label] + if label in eg.predicted.cats: + truths[i, j] = eg.reference.cats[label] else: not_missing[i, j] = 0. truths = self.model.ops.asarray(truths) @@ -993,7 +941,7 @@ class TextCategorizer(Pipe): # TODO: begin_training is not guaranteed to see all data / labels ? examples = list(get_examples()) for example in examples: - for cat in example.doc_annotation.cats: + for cat in example.y.cats: self.add_label(cat) self.require_labels() docs = [Doc(Vocab(), words=["hello"])] @@ -1152,21 +1100,22 @@ class EntityLinker(Pipe): losses.setdefault(self.name, 0.0) if not examples: return 0 - examples = Example.to_example_objects(examples) + for eg in examples: + assert isinstance(eg, Example) sentence_docs = [] - docs = [ex.doc for ex in examples] + docs = [eg.predicted for eg in examples] if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( predictions = self.model.predict(docs) for eg in examples: - doc = eg.doc + doc = eg.predicted ents_by_offset = dict() for ent in doc.ents: ents_by_offset[(ent.start_char, ent.end_char)] = ent - - for entity, kb_dict in eg.doc_annotation.links.items(): + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): if isinstance(entity, str): entity = literal_eval(entity) start, end = entity @@ -1204,7 +1153,8 @@ class EntityLinker(Pipe): def get_similarity_loss(self, examples, scores): entity_encodings = [] for eg in examples: - for entity, kb_dict in eg.doc_annotation.links.items(): + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): for kb_id, value in kb_dict.items(): # this loss function assumes we're only using positive examples if value: @@ -1223,8 +1173,9 @@ class EntityLinker(Pipe): def get_loss(self, examples, scores): cats = [] - for ex in examples: - for entity, kb_dict in ex.doc_annotation.links.items(): + for eg in examples: + links = self._get_links_from_doc(eg.reference) + for entity, kb_dict in links.items(): for kb_id, value in kb_dict.items(): cats.append([value]) @@ -1237,27 +1188,22 @@ class EntityLinker(Pipe): loss = loss / len(cats) return loss, d_scores - def __call__(self, example): - doc = self._get_doc(example) + def _get_links_from_doc(self, doc): + return {} + + def __call__(self, doc): kb_ids, tensors = self.predict([doc]) self.set_annotations([doc], kb_ids, tensors=tensors) if isinstance(example, Example): - example.doc = doc + example.x = doc return example return doc def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + for docs in util.minibatch(stream, size=batch_size): kb_ids, tensors = self.predict(docs) self.set_annotations(docs, kb_ids, tensors=tensors) - - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """ Return the KB IDs for each entity in each doc, including NIL if there is no prediction """ @@ -1433,7 +1379,7 @@ class Sentencizer(Pipe): ): pass - def __call__(self, example): + def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. example (Doc or Example): The document to process. @@ -1441,7 +1387,6 @@ class Sentencizer(Pipe): DOCS: https://spacy.io/api/sentencizer#call """ - doc = self._get_doc(example) start = 0 seen_period = False for i, token in enumerate(doc): @@ -1460,21 +1405,15 @@ class Sentencizer(Pipe): return example return doc - def pipe(self, stream, batch_size=128, n_threads=-1, as_example=False): - for examples in util.minibatch(stream, size=batch_size): - docs = [self._get_doc(ex) for ex in examples] + def pipe(self, stream, batch_size=128, n_threads=-1): + for docs in util.minibatch(stream, size=batch_size): predictions = self.predict(docs) if isinstance(predictions, tuple) and len(tuple) == 2: scores, tensors = predictions self.set_annotations(docs, scores, tensors=tensors) else: self.set_annotations(docs, predictions) - if as_example: - for ex, doc in zip(examples, docs): - ex.doc = doc - yield ex - else: - yield from docs + yield from docs def predict(self, docs): """Apply the pipeline's model to a batch of docs, without