Merge branch 'whatif/arrow' of https://github.com/explosion/spaCy into whatif/arrow

Commit a389866df6
@@ -4,8 +4,10 @@ import random
 import warnings
 import srsly
 import spacy
+from spacy.gold import Example
 from spacy.util import minibatch, compounding
 
+# TODO: further fix & test this script for v.3 ? (read_gold_data is never called)
 
 LABEL = "ANIMAL"
 TRAIN_DATA = [
@@ -35,15 +37,13 @@ def read_raw_data(nlp, jsonl_loc):
 
 
 def read_gold_data(nlp, gold_loc):
-    docs = []
-    golds = []
+    examples = []
     for json_obj in srsly.read_jsonl(gold_loc):
         doc = nlp.make_doc(json_obj["text"])
         ents = [(ent["start"], ent["end"], ent["label"]) for ent in json_obj["spans"]]
-        gold = GoldParse(doc, entities=ents)
-        docs.append(doc)
-        golds.append(gold)
-    return list(zip(docs, golds))
+        example = Example.from_dict(doc, {"entities": ents})
+        examples.append(example)
+    return examples
 
 
 def main(model_name, unlabelled_loc):
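For orientation, a minimal sketch of the GoldParse-to-Example change above, applied to a single toy record instead of a JSONL file (illustrative only; the record and blank pipeline are invented, the calls are the ones shown in the hunk):

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
record = {"text": "I saw a horse.", "spans": [{"start": 8, "end": 13, "label": "ANIMAL"}]}

doc = nlp.make_doc(record["text"])
ents = [(ent["start"], ent["end"], ent["label"]) for ent in record["spans"]]
# Previously: gold = GoldParse(doc, entities=ents)
example = Example.from_dict(doc, {"entities": ents})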
@@ -62,11 +62,10 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non
     train_examples = []
     for text, cats in zip(train_texts, train_cats):
         doc = nlp.make_doc(text)
-        gold = GoldParse(doc, cats=cats)
+        example = Example.from_dict(doc, {"cats": cats})
         for cat in cats:
             textcat.add_label(cat)
-        ex = Example.from_gold(gold, doc=doc)
-        train_examples.append(ex)
+        train_examples.append(example)
 
     with nlp.select_pipes(enable="textcat"):  # only train textcat
         optimizer = nlp.begin_training()
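The same pattern for text classification, as a self-contained toy sketch (the text and category values are invented; it assumes the spacy.gold.Example API used on this branch):

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("This movie was great fun.")
# Previously: gold = GoldParse(doc, cats={"POSITIVE": 1.0, "NEGATIVE": 0.0})
example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
print(example.reference.cats)  # expected: {'POSITIVE': 1.0, 'NEGATIVE': 0.0}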
@@ -231,8 +231,8 @@ def train(
     # check whether the setting 'exclusive_classes' corresponds to the provided training data
     if textcat_multilabel:
         multilabel_found = False
-        for ex in corpus.train_examples:
-            cats = ex.doc_annotation.cats
+        for eg in corpus.train_annotations:
+            cats = eg.reference.cats
             textcat_labels.update(cats.keys())
             if list(cats.values()).count(1.0) != 1:
                 multilabel_found = True
@@ -244,8 +244,8 @@ def train(
                 "mutually exclusive classes more accurately."
             )
     else:
-        for ex in corpus.train_examples:
-            cats = ex.doc_annotation.cats
+        for eg in corpus.train_annotations:
+            cats = eg.reference.cats
             textcat_labels.update(cats.keys())
             if list(cats.values()).count(1.0) != 1:
                 msg.fail(
@@ -346,10 +346,8 @@ def train(
                     progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
                 # Clean up the objects to faciliate garbage collection.
                 for eg in batch:
-                    eg.doc = None
-                    eg.goldparse = None
-                    eg.doc_annotation = None
-                    eg.token_annotation = None
+                    eg.reference = None
+                    eg.predicted = None
     except Exception as e:
         msg.warn(
             f"Aborting and saving the final best model. "
@@ -469,7 +467,7 @@ def train_while_improving(
 
     Every iteration, the function yields out a tuple with:
 
-    * batch: A zipped sequence of Tuple[Doc, GoldParse] pairs.
+    * batch: A list of Example objects.
     * info: A dict with various information about the last update (see below).
     * is_best_checkpoint: A value in None, False, True, indicating whether this
       was the best evaluation so far. You should use this to save the model
@@ -72,7 +72,7 @@ class GoldCorpus(object):
 
     @staticmethod
     def read_annotations(locs, limit=0):
-        """ Yield training examples """
+        """ Yield training examples as example dicts """
        i = 0
        for loc in locs:
            loc = util.ensure_path(loc)
@@ -117,7 +117,7 @@ cdef class Example:
                 i = j2i_multi[j]
                 if output[i] is None:
                     output[i] = gold_values[j]
-        if as_string and field not in ["ENT_IOB"]:
+        if as_string and field not in ["ENT_IOB", "SENT_START"]:
             output = [vocab.strings[o] if o is not None else o for o in output]
         return output
 
@@ -146,22 +146,19 @@ cdef class Example:
         sent_starts and return a list of the new Examples"""
         if not self.reference.is_sentenced:
             return [self]
-        # TODO: Do this for misaligned somehow?
-        predicted_words = [t.text for t in self.predicted]
-        reference_words = [t.text for t in self.reference]
-        if predicted_words != reference_words:
-            raise NotImplementedError("TODO: Implement this")
-        # Implement the easy case.
+        sent_starts = self.get_aligned("SENT_START")
+        sent_starts.append(1)  # appending virtual start of a next sentence to facilitate search
         output = []
-        cls = self.__class__
+        pred_start = 0
         for sent in self.reference.sents:
-            # I guess for misaligned we just need to use the gold_to_cand?
-            output.append(
-                cls(
-                    self.predicted[sent.start : sent.end + 1].as_doc(),
-                    sent.as_doc()
-                )
-            )
+            new_ref = sent.as_doc()
+            pred_end = sent_starts.index(1, pred_start+1)  # find where the next sentence starts
+            new_pred = self.predicted[pred_start : pred_end].as_doc()
+            output.append(Example(new_pred, new_ref))
+            pred_start = pred_end
         return output
 
     property text:
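The boundary search in the new split_sents can be illustrated with plain Python lists (illustration only, not spaCy code): the trailing 1 that gets appended guarantees that index() always finds an end position for the last sentence.

sent_starts = [1, 0, 0, 1, 0, 0, 0]   # aligned SENT_START values for 7 tokens
sent_starts.append(1)                 # virtual start of a "next" sentence

spans = []
pred_start = 0
while pred_start < len(sent_starts) - 1:
    pred_end = sent_starts.index(1, pred_start + 1)  # where the next sentence begins
    spans.append((pred_start, pred_end))
    pred_start = pred_end

print(spans)  # [(0, 3), (3, 7)] -> token slices for the two sentences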
@@ -108,12 +108,18 @@ def json_to_annotations(doc):
             words.append(token["orth"])
             spaces.append(token.get("space", True))
             ids.append(token.get('id', sent_start_i + i))
-            tags.append(token.get('tag', "-"))
-            pos.append(token.get("pos", ""))
-            morphs.append(token.get("morph", ""))
-            lemmas.append(token.get("lemma", ""))
-            heads.append(token.get("head", 0) + sent_start_i + i)
-            labels.append(token.get("dep", ""))
+            if "tag" in token:
+                tags.append(token["tag"])
+            if "pos" in token:
+                pos.append(token["pos"])
+            if "morph" in token:
+                morphs.append(token["morph"])
+            if "lemma" in token:
+                lemmas.append(token["lemma"])
+            if "head" in token:
+                heads.append(token["head"] + sent_start_i + i)
+            if "dep" in token:
+                labels.append(token["dep"])
             # Ensure ROOT label is case-insensitive
             if labels[-1].lower() == "root":
                 labels[-1] = "ROOT"
@@ -130,15 +136,24 @@ def json_to_annotations(doc):
         ids=ids,
         words=words,
         spaces=spaces,
-        tags=tags,
-        pos=pos,
-        morphs=morphs,
-        lemmas=lemmas,
-        heads=heads,
-        deps=labels,
         sent_starts=sent_starts,
         brackets=brackets
     )
+    # avoid including dummy values that looks like gold info was present
+    if tags:
+        example["token_annotation"]["tags"] = tags
+    if pos:
+        example["token_annotation"]["pos"] = pos
+    if morphs:
+        example["token_annotation"]["morphs"] = morphs
+    if lemmas:
+        example["token_annotation"]["lemmas"] = lemmas
+    if heads:
+        example["token_annotation"]["heads"] = heads
+    if labels:
+        example["token_annotation"]["deps"] = labels
+    if pos:
+        example["token_annotation"]["pos"] = pos
 
     cats = {}
     for cat in paragraph.get("cats", {}):
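The idea behind the block of if-statements added above, reduced to plain dicts (illustration only): fields that were never annotated are left out of the token annotation instead of being filled with dummy values.

token = {"orth": "horses", "tag": "NNS"}  # toy token: no "pos", "morph", "lemma", ...

tags, pos = [], []
if "tag" in token:
    tags.append(token["tag"])
if "pos" in token:
    pos.append(token["pos"])

token_annotation = {}
if tags:
    token_annotation["tags"] = tags
if pos:
    token_annotation["pos"] = pos

print(token_annotation)  # {'tags': ['NNS']} -- no dummy "" entries for missing fields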
@@ -143,8 +143,7 @@ def _has_ner(eg):
 def _get_labels(examples):
     labels = set()
     for eg in examples:
-        for ner_tag in eg.token_annotation.entities:
+        for ner_tag in eg.get_aligned("ENT_TYPE", as_string=True):
             if ner_tag != 'O' and ner_tag != '-':
-                _, label = ner_tag.split('-', 1)
-                labels.add(label)
+                labels.add(ner_tag)
     return list(sorted(labels))
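A small self-contained sketch of reading entity labels through get_aligned, as _get_labels now does (toy text and label; it assumes the Example API on this branch and adds an extra check for empty strings that the hunk itself does not have):

import spacy
from spacy.gold import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("I saw a horse.")
example = Example.from_dict(doc, {"entities": [(8, 13, "ANIMAL")]})

labels = set()
for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
    if ner_tag and ner_tag not in ("O", "-"):  # skip unlabelled tokens
        labels.add(ner_tag)
print(sorted(labels))  # expected: ['ANIMAL']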
@@ -59,10 +59,10 @@ class Tok2Vec(Pipe):
         YIELDS (iterator): A sequence of `Doc` objects, in order of input.
         """
         for docs in minibatch(stream, batch_size):
-            batch = list(batch)
+            docs = list(docs)
             tokvecses = self.predict(docs)
             self.set_annotations(docs, tokvecses)
-            yield from batch
+            yield from docs
 
     def predict(self, docs):
         """Return a single tensor for a batch of documents.
@@ -11,6 +11,7 @@ from spacy.util import fix_random_seed
 
 from ..util import make_tempdir
 from spacy.pipeline.defaults import default_tok2vec
+from ...gold import Example
 
 TRAIN_DATA = [
     ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
@@ -50,21 +51,20 @@ def test_textcat_learns_multilabel():
             cats = {letter: float(w2 == letter) for letter in letters}
             docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
     random.shuffle(docs)
-    model = TextCategorizer(nlp.vocab, width=8)
+    textcat = TextCategorizer(nlp.vocab, width=8)
     for letter in letters:
-        model.add_label(letter)
-    optimizer = model.begin_training()
+        textcat.add_label(letter)
+    optimizer = textcat.begin_training()
     for i in range(30):
         losses = {}
-        Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
-        Xs = [doc for doc, cats in docs]
-        model.update(Xs, Ys, sgd=optimizer, losses=losses)
+        examples = [Example.from_dict(doc, {"cats": cats}) for doc, cat in docs]
+        textcat.update(examples, sgd=optimizer, losses=losses)
         random.shuffle(docs)
     for w1 in letters:
         for w2 in letters:
             doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
             truth = {letter: w2 == letter for letter in letters}
-            model(doc)
+            textcat(doc)
             for cat, score in doc.cats.items():
                 if not truth[cat]:
                     assert score < 0.5
@@ -1,5 +1,7 @@
 from collections import defaultdict
 
+import pytest
+
 from spacy.pipeline.defaults import default_ner
 from spacy.pipeline import EntityRecognizer
 
@@ -7,6 +9,8 @@ from spacy.lang.en import English
 from spacy.tokens import Span
 
 
+# skipped after removing Beam stuff during the Example/GoldParse refactor
+@pytest.mark.skip
 def test_issue4313():
     """ This should not crash or exit with some strange error code """
     beam_width = 16
@@ -1,9 +1,11 @@
 import pytest
 
+from spacy.gold import Example
+
 
 @pytest.mark.parametrize(
     "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
 )
 def test_gold_misaligned(en_tokenizer, text, words):
     doc = en_tokenizer(text)
-    GoldParse(doc, words=words)
+    Example.from_dict(doc, {"words": words})
@@ -90,6 +90,7 @@ def merged_dict():
     return {
         "ids": [1, 2, 3, 4, 5, 6, 7],
         "words": ["Hi", "there", "everyone", "It", "is", "just", "me"],
+        "spaces": [True, True, True, True, True, True, False],
         "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"],
         "sent_starts": [1, 0, 0, 1, 0, 0, 0],
     }
@@ -150,6 +151,30 @@ def test_gold_biluo_misalign(en_vocab):
     assert tags == ["O", "O", "O", "-", "-", "-"]
 
 
+def test_split_sentences(en_vocab):
+    words = ["I", "flew", "to", "San Francisco Valley", "had", "loads of fun"]
+    doc = Doc(en_vocab, words=words)
+    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of", "fun"]
+    sent_starts = [True, False, False, False, False, False, True, False, False, False]
+    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
+    assert example.text == "I flew to San Francisco Valley had loads of fun "
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2
+    assert split_examples[0].text == "I flew to San Francisco Valley "
+    assert split_examples[1].text == "had loads of fun "
+
+    words = ["I", "flew", "to", "San", "Francisco", "Valley", "had", "loads", "of fun"]
+    doc = Doc(en_vocab, words=words)
+    gold_words = ["I", "flew", "to", "San Francisco", "Valley", "had", "loads of", "fun"]
+    sent_starts = [True, False, False, False, False, True, False, False]
+    example = Example.from_dict(doc, {"words": gold_words, "sent_starts": sent_starts})
+    assert example.text == "I flew to San Francisco Valley had loads of fun "
+    split_examples = example.split_sents()
+    assert len(split_examples) == 2
+    assert split_examples[0].text == "I flew to San Francisco Valley "
+    assert split_examples[1].text == "had loads of fun "
+
+
 def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     # one-to-many
     words = ["I", "flew to", "San Francisco Valley", "."]
@@ -466,7 +491,7 @@ def _train(train_data):
 def test_split_sents(merged_dict):
     nlp = English()
     example = Example.from_dict(
-        Doc(nlp.vocab, words=merged_dict["words"]),
+        Doc(nlp.vocab, words=merged_dict["words"], spaces=merged_dict["spaces"]),
         merged_dict
     )
     assert len(get_parses_from_example(
@@ -484,6 +509,8 @@ def test_split_sents(merged_dict):
 
     split_examples = example.split_sents()
     assert len(split_examples) == 2
+    assert split_examples[0].text == "Hi there everyone "
+    assert split_examples[1].text == "It is just me"
 
     token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
     assert token_annotation_1["words"] == ["Hi", "there", "everyone"]