mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
`Example.get_aligned_parse`: Handle unit and zero length vectors correctly (#11026)
* `Example.get_aligned_parse`: Do not squeeze gold token idx vector

  Correctly handle zero-size vectors passed to `np.vectorize`
* Add tests
* Use `Doc` ctor to initialize attributes
* Remove unintended change

  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Remove unused import

  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
a9559e7435
commit
1d5cad0b42
|
@ -679,6 +679,31 @@ def test_projectivize(en_tokenizer):
|
||||||
assert proj_heads == [3, 2, 3, 3, 3]
|
assert proj_heads == [3, 2, 3, 3, 3]
|
||||||
assert nonproj_heads == [3, 2, 3, 3, 2]
|
assert nonproj_heads == [3, 2, 3, 3, 2]
|
||||||
|
|
||||||
|
# Test single token documents
|
||||||
|
doc = en_tokenizer("Conrail")
|
||||||
|
heads = [0]
|
||||||
|
deps = ["dep"]
|
||||||
|
example = Example.from_dict(doc, {"heads": heads, "deps": deps})
|
||||||
|
proj_heads, proj_labels = example.get_aligned_parse(projectivize=True)
|
||||||
|
assert proj_heads == heads
|
||||||
|
assert proj_labels == deps
|
||||||
|
|
||||||
|
# Test documents with no alignments
|
||||||
|
doc_a = Doc(
|
||||||
|
doc.vocab, words=["Double-Jointed"], spaces=[False], deps=["ROOT"], heads=[0]
|
||||||
|
)
|
||||||
|
doc_b = Doc(
|
||||||
|
doc.vocab,
|
||||||
|
words=["Double", "-", "Jointed"],
|
||||||
|
spaces=[True, True, True],
|
||||||
|
deps=["amod", "punct", "ROOT"],
|
||||||
|
heads=[2, 2, 2],
|
||||||
|
)
|
||||||
|
example = Example(doc_a, doc_b)
|
||||||
|
proj_heads, proj_deps = example.get_aligned_parse(projectivize=True)
|
||||||
|
assert proj_heads == [None]
|
||||||
|
assert proj_deps == [None]
|
||||||
|
|
||||||
|
|
||||||
def test_iob_to_biluo():
|
def test_iob_to_biluo():
|
||||||
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
|
||||||
|
|
|
@ -249,9 +249,9 @@ cdef class Example:
|
||||||
# Fetch all aligned gold token incides.
|
# Fetch all aligned gold token incides.
|
||||||
if c2g_single_toks.shape == cand_to_gold.lengths.shape:
|
if c2g_single_toks.shape == cand_to_gold.lengths.shape:
|
||||||
# This the most likely case.
|
# This the most likely case.
|
||||||
gold_i = cand_to_gold[:].squeeze()
|
gold_i = cand_to_gold[:]
|
||||||
else:
|
else:
|
||||||
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0])(c2g_single_toks).squeeze()
|
gold_i = numpy.vectorize(lambda x: cand_to_gold[int(x)][0], otypes='i')(c2g_single_toks)
|
||||||
|
|
||||||
# Fetch indices of all gold heads for the aligned gold tokens.
|
# Fetch indices of all gold heads for the aligned gold tokens.
|
||||||
heads = numpy.asarray(heads, dtype='i')
|
heads = numpy.asarray(heads, dtype='i')
|
||||||
|
@ -261,7 +261,7 @@ cdef class Example:
|
||||||
# gold tokens (and are aligned to a single candidate token).
|
# gold tokens (and are aligned to a single candidate token).
|
||||||
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
|
g2c_len_heads = gold_to_cand.lengths[gold_head_i]
|
||||||
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
|
g2c_len_heads = numpy.where(g2c_len_heads == 1)[0]
|
||||||
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0])(gold_head_i[g2c_len_heads]).squeeze()
|
g2c_i = numpy.vectorize(lambda x: gold_to_cand[int(x)][0], otypes='i')(gold_head_i[g2c_len_heads]).squeeze()
|
||||||
|
|
||||||
# Update head/dep alignments with the above.
|
# Update head/dep alignments with the above.
|
||||||
aligned_heads = numpy.full((self.x.length), None)
|
aligned_heads = numpy.full((self.x.length), None)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user