mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Example Dict format consistency (#5858)
* consistently use upper-case IDS in token_annotation format and for get_aligned * remove ID from to_dict (not used in from_dict either) * fix test Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
fa79a0db9f
commit
34873c4911
|
@ -183,15 +183,15 @@ cdef class Example:
|
|||
"links": self._links_to_dict()
|
||||
},
|
||||
"token_annotation": {
|
||||
"ids": [t.i+1 for t in self.reference],
|
||||
"words": [t.text for t in self.reference],
|
||||
"tags": [t.tag_ for t in self.reference],
|
||||
"lemmas": [t.lemma_ for t in self.reference],
|
||||
"pos": [t.pos_ for t in self.reference],
|
||||
"morphs": [t.morph_ for t in self.reference],
|
||||
"heads": [t.head.i for t in self.reference],
|
||||
"deps": [t.dep_ for t in self.reference],
|
||||
"sent_starts": [int(bool(t.is_sent_start)) for t in self.reference]
|
||||
"ORTH": [t.text for t in self.reference],
|
||||
"SPACY": [bool(t.whitespace_) for t in self.reference],
|
||||
"TAG": [t.tag_ for t in self.reference],
|
||||
"LEMMA": [t.lemma_ for t in self.reference],
|
||||
"POS": [t.pos_ for t in self.reference],
|
||||
"MORPH": [t.morph_ for t in self.reference],
|
||||
"HEAD": [t.head.i for t in self.reference],
|
||||
"DEP": [t.dep_ for t in self.reference],
|
||||
"SENT_START": [int(bool(t.is_sent_start)) for t in self.reference]
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -335,10 +335,14 @@ def _fix_legacy_dict_data(example_dict):
|
|||
for key, value in old_token_dict.items():
|
||||
if key in ("text", "ids", "brackets"):
|
||||
pass
|
||||
elif key in remapping.values():
|
||||
token_dict[key] = value
|
||||
elif key.lower() in remapping:
|
||||
token_dict[remapping[key.lower()]] = value
|
||||
else:
|
||||
raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=remapping.keys()))
|
||||
all_keys = set(remapping.values())
|
||||
all_keys.update(remapping.keys())
|
||||
raise KeyError(Errors.E983.format(key=key, dict="token_annotation", keys=all_keys))
|
||||
text = example_dict.get("text", example_dict.get("raw"))
|
||||
if _has_field(token_dict, "ORTH") and not _has_field(token_dict, "SPACY"):
|
||||
token_dict["SPACY"] = _guess_spaces(text, token_dict["ORTH"])
|
||||
|
|
|
@ -108,7 +108,7 @@ class SentenceRecognizer(Tagger):
|
|||
truths = []
|
||||
for eg in examples:
|
||||
eg_truth = []
|
||||
for x in eg.get_aligned("sent_start"):
|
||||
for x in eg.get_aligned("SENT_START"):
|
||||
if x is None:
|
||||
eg_truth.append(None)
|
||||
elif x == 1:
|
||||
|
|
|
@ -259,7 +259,7 @@ class Tagger(Pipe):
|
|||
DOCS: https://spacy.io/api/tagger#get_loss
|
||||
"""
|
||||
loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
|
||||
truths = [eg.get_aligned("tag", as_string=True) for eg in examples]
|
||||
truths = [eg.get_aligned("TAG", as_string=True) for eg in examples]
|
||||
d_scores, loss = loss_func(scores, truths)
|
||||
if self.model.ops.xp.isnan(loss):
|
||||
raise ValueError("nan value when computing loss")
|
||||
|
|
|
@ -646,14 +646,14 @@ def test_split_sents(merged_dict):
|
|||
assert split_examples[1].text == "It is just me"
|
||||
|
||||
token_annotation_1 = split_examples[0].to_dict()["token_annotation"]
|
||||
assert token_annotation_1["words"] == ["Hi", "there", "everyone"]
|
||||
assert token_annotation_1["tags"] == ["INTJ", "ADV", "PRON"]
|
||||
assert token_annotation_1["sent_starts"] == [1, 0, 0]
|
||||
assert token_annotation_1["ORTH"] == ["Hi", "there", "everyone"]
|
||||
assert token_annotation_1["TAG"] == ["INTJ", "ADV", "PRON"]
|
||||
assert token_annotation_1["SENT_START"] == [1, 0, 0]
|
||||
|
||||
token_annotation_2 = split_examples[1].to_dict()["token_annotation"]
|
||||
assert token_annotation_2["words"] == ["It", "is", "just", "me"]
|
||||
assert token_annotation_2["tags"] == ["PRON", "AUX", "ADV", "PRON"]
|
||||
assert token_annotation_2["sent_starts"] == [1, 0, 0, 0]
|
||||
assert token_annotation_2["ORTH"] == ["It", "is", "just", "me"]
|
||||
assert token_annotation_2["TAG"] == ["PRON", "AUX", "ADV", "PRON"]
|
||||
assert token_annotation_2["SENT_START"] == [1, 0, 0, 0]
|
||||
|
||||
|
||||
def test_alignment():
|
||||
|
@ -723,4 +723,4 @@ def test_retokenized_docs(doc):
|
|||
retokenizer.merge(doc1[0:2])
|
||||
retokenizer.merge(doc1[5:7])
|
||||
|
||||
assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
|
||||
assert example.get_aligned("ORTH", as_string=True) == [None, 'sister', 'flew', 'to', None, 'via', 'London', '.']
|
|
@ -42,7 +42,7 @@ def test_Example_from_dict_with_tags(pred_words, annots):
|
|||
example = Example.from_dict(predicted, annots)
|
||||
for i, token in enumerate(example.reference):
|
||||
assert token.tag_ == annots["tags"][i]
|
||||
aligned_tags = example.get_aligned("tag", as_string=True)
|
||||
aligned_tags = example.get_aligned("TAG", as_string=True)
|
||||
assert aligned_tags == ["NN" for _ in predicted]
|
||||
|
||||
|
||||
|
@ -53,9 +53,13 @@ def test_aligned_tags():
|
|||
annots = {"words": gold_words, "tags": gold_tags}
|
||||
vocab = Vocab()
|
||||
predicted = Doc(vocab, words=pred_words)
|
||||
example = Example.from_dict(predicted, annots)
|
||||
aligned_tags = example.get_aligned("tag", as_string=True)
|
||||
assert aligned_tags == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
|
||||
example1 = Example.from_dict(predicted, annots)
|
||||
aligned_tags1 = example1.get_aligned("TAG", as_string=True)
|
||||
assert aligned_tags1 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
|
||||
# ensure that to_dict works correctly
|
||||
example2 = Example.from_dict(predicted, example1.to_dict())
|
||||
aligned_tags2 = example2.get_aligned("TAG", as_string=True)
|
||||
assert aligned_tags2 == ["VERB", "DET", "NOUN", "SCONJ", "PRON", "VERB", "VERB"]
|
||||
|
||||
|
||||
def test_aligned_tags_multi():
|
||||
|
@ -66,7 +70,7 @@ def test_aligned_tags_multi():
|
|||
vocab = Vocab()
|
||||
predicted = Doc(vocab, words=pred_words)
|
||||
example = Example.from_dict(predicted, annots)
|
||||
aligned_tags = example.get_aligned("tag", as_string=True)
|
||||
aligned_tags = example.get_aligned("TAG", as_string=True)
|
||||
assert aligned_tags == [None, None, "SCONJ", "PRON", "VERB", "VERB"]
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user