correct silly typo

This commit is contained in:
svlandeg 2020-06-17 14:48:14 +02:00
parent f6c451b650
commit 1a151b10d6
11 changed files with 32 additions and 28 deletions

View File

@@ -148,7 +148,7 @@ def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None):
sentence = {} sentence = {}
tokens = [] tokens = []
token_annotation = example_dict["token_annotation"] token_annotation = example_dict["token_annotation"]
for i, id_ in enumerate(["ids"]): for i, id_ in enumerate(token_annotation["ids"]):
token = {} token = {}
token["id"] = id_ token["id"] = id_
token["orth"] = token_annotation["words"][i] token["orth"] = token_annotation["words"][i]

View File

@@ -485,12 +485,12 @@ def _compile_gold(examples, pipeline, nlp):
"texts": set(), "texts": set(),
} }
for example in examples: for example in examples:
gold = example.gold gold = example.reference
doc = example.doc doc = example.predicted
valid_words = [x for x in gold.words if x is not None] valid_words = [x for x in gold if x is not None]
data["words"].update(valid_words) data["words"].update(valid_words)
data["n_words"] += len(valid_words) data["n_words"] += len(valid_words)
data["n_misaligned_words"] += len(gold.words) - len(valid_words) data["n_misaligned_words"] += len(gold) - len(valid_words)
data["texts"].add(doc.text) data["texts"].add(doc.text)
if len(nlp.vocab.vectors): if len(nlp.vocab.vectors):
for word in valid_words: for word in valid_words:
@@ -545,10 +545,10 @@ def _format_labels(labels, counts=False):
def _get_examples_without_label(data, label): def _get_examples_without_label(data, label):
count = 0 count = 0
for ex in data: for eg in data:
labels = [ labels = [
label.split("-")[1] label.split("-")[1]
for label in ex.gold.ner for label in eg.gold.ner
if label not in ("O", "-", None) if label not in ("O", "-", None)
] ]
if label not in labels: if label not in labels:

View File

@@ -56,8 +56,10 @@ cdef class Example:
if "ORTH" not in tok_dict: if "ORTH" not in tok_dict:
tok_dict["ORTH"] = [tok.text for tok in predicted] tok_dict["ORTH"] = [tok.text for tok in predicted]
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted] tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
print("added ORTH and SPACY to the tok_dict")
if "SPACY" not in tok_dict: if "SPACY" not in tok_dict:
tok_dict["SPACY"] = None tok_dict["SPACY"] = None
print("added SPACY to the tok_dict")
return Example( return Example(
predicted, predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict) annotations2doc(predicted.vocab, tok_dict, doc_dict)
@@ -75,13 +77,15 @@ cdef class Example:
def get_aligned(self, field, as_string=False): def get_aligned(self, field, as_string=False):
"""Return an aligned array for a token attribute.""" """Return an aligned array for a token attribute."""
# TODO: This is probably wrong. I just bashed this out and there's probably
# all sorts of edge-cases.
alignment = self.alignment alignment = self.alignment
i2j_multi = alignment.i2j_multi i2j_multi = alignment.i2j_multi
j2i_multi = alignment.j2i_multi j2i_multi = alignment.j2i_multi
gold_to_cand = alignment.gold_to_cand gold_to_cand = alignment.gold_to_cand
cand_to_gold = alignment.cand_to_gold cand_to_gold = alignment.cand_to_gold
print("i2j_multi", i2j_multi)
print("j2i_multi", j2i_multi)
print("gold_to_cand", gold_to_cand)
print("cand_to_gold", cand_to_gold)
vocab = self.reference.vocab vocab = self.reference.vocab
gold_values = self.reference.to_array([field]) gold_values = self.reference.to_array([field])
@@ -97,6 +101,7 @@ cdef class Example:
else: else:
output[i] = gold_values[gold_i] output[i] = gold_values[gold_i]
print("output before:" , output)
if field in ["ENT_IOB"]: if field in ["ENT_IOB"]:
# Fix many-to-one IOB codes # Fix many-to-one IOB codes
prev_j = -1 prev_j = -1
@@ -111,17 +116,23 @@ cdef class Example:
prev_j = -1 prev_j = -1
prev_value = value prev_value = value
print("output in between:" , output)
if field in ["ENT_IOB", "ENT_TYPE"]: if field in ["ENT_IOB", "ENT_TYPE"]:
# Assign one-to-many NER tags # Assign one-to-many NER tags
for j, cand_j in enumerate(gold_to_cand): for j, cand_j in enumerate(gold_to_cand):
print()
print("j", j)
print("cand_j", cand_j)
if cand_j is None: if cand_j is None:
if j in j2i_multi: if j in j2i_multi:
i = j2i_multi[j] i = j2i_multi[j]
if output[i] is None: if output[i] is None:
output[i] = gold_values[j] output[i] = gold_values[j]
print("output final:" , output)
if as_string: if as_string:
output = [vocab.strings[o] if o is not None else o for o in output] output = [vocab.strings[o] if o is not None else o for o in output]
print("output as string:" , output)
return output return output
def to_dict(self): def to_dict(self):

View File

@@ -1,7 +1,6 @@
import pytest import pytest
from thinc.api import Adam, NumpyOps from thinc.api import Adam, NumpyOps
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.gold import GoldParse
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser, default_ner from spacy.pipeline.defaults import default_parser, default_ner

View File

@@ -4,7 +4,6 @@ from spacy.vocab import Vocab
from spacy.pipeline.defaults import default_parser from spacy.pipeline.defaults import default_parser
from spacy.pipeline import DependencyParser from spacy.pipeline import DependencyParser
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize from spacy.syntax.nonproj import projectivize
from spacy.syntax.stateclass import StateClass from spacy.syntax.stateclass import StateClass
from spacy.syntax.arc_eager import ArcEager from spacy.syntax.arc_eager import ArcEager

View File

@@ -5,7 +5,6 @@ from spacy.pipeline.defaults import default_ner
from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.syntax.ner import BiluoPushDown from spacy.syntax.ner import BiluoPushDown
from spacy.gold import GoldParse
from spacy.tokens import Doc from spacy.tokens import Doc
from ..util import make_tempdir from ..util import make_tempdir

View File

@@ -4,7 +4,6 @@ from spacy.vocab import Vocab
from spacy.syntax.arc_eager import ArcEager from spacy.syntax.arc_eager import ArcEager
from spacy.syntax.nn_parser import Parser from spacy.syntax.nn_parser import Parser
from spacy.tokens.doc import Doc from spacy.tokens.doc import Doc
from spacy.gold import GoldParse
from thinc.api import Model from thinc.api import Model

View File

@@ -6,9 +6,7 @@ from spacy.pipeline.defaults import default_parser
from spacy.pipeline import DependencyParser from spacy.pipeline import DependencyParser
from spacy.syntax.arc_eager import ArcEager from spacy.syntax.arc_eager import ArcEager
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.syntax._beam_utils import ParserBeam
from spacy.syntax.stateclass import StateClass from spacy.syntax.stateclass import StateClass
from spacy.gold import GoldParse
@pytest.fixture @pytest.fixture

View File

@@ -7,7 +7,6 @@ from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.pipeline import TextCategorizer from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.gold import GoldParse
from spacy.util import fix_random_seed from spacy.util import fix_random_seed
from ..util import make_tempdir from ..util import make_tempdir

View File

@@ -3,7 +3,7 @@ import gc
import numpy import numpy
import copy import copy
from spacy.gold import Example, TokenAnnotation from spacy.gold import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop from spacy.lang.lex_attrs import is_stop
@@ -268,20 +268,21 @@ def test_issue1963(en_tokenizer):
assert doc.tensor.shape == (3, 128) assert doc.tensor.shape == (3, 128)
# TODO: fix
@pytest.mark.parametrize("label", ["U-JOB-NAME"]) @pytest.mark.parametrize("label", ["U-JOB-NAME"])
def test_issue1967(label): def test_issue1967(label):
config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0} config = {"learn_tokens": False, "min_action_freq": 30, "beam_width": 1, "beam_update_prob": 1.0}
ner = EntityRecognizer(Vocab(), default_ner(), **config) ner = EntityRecognizer(Vocab(), default_ner(), **config)
example = Example( example = Example.from_dict(
doc=Doc(ner.vocab, words=["word"]), Doc(ner.vocab, words=["word"]),
token_annotation=TokenAnnotation( {
ids=[0], "ids": [0],
words=["word"], "words": ["word"],
tags=["tag"], "tags": ["tag"],
heads=[0], "heads": [0],
deps=["dep"], "deps": ["dep"],
entities=[label] "entities": [label]
) }
) )
ner.moves.get_actions(gold_parses=[example]) ner.moves.get_actions(gold_parses=[example])

View File

@@ -1,5 +1,4 @@
import pytest import pytest
from spacy.gold import GoldParse
@pytest.mark.parametrize( @pytest.mark.parametrize(