Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-14 02:02:20 +03:00)
Modified gold.align to handle space tokens (#4537)
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Parent: f2bfaa1b38
Commit: df293f3894
|
@ -132,6 +132,14 @@ def align(tokens_a, tokens_b):
|
||||||
offset_a = offset_b = 0
|
offset_a = offset_b = 0
|
||||||
i += 1
|
i += 1
|
||||||
j += 1
|
j += 1
|
||||||
|
elif a == "":
|
||||||
|
assert offset_a == 0
|
||||||
|
cost += 1
|
||||||
|
i += 1
|
||||||
|
elif b == "":
|
||||||
|
assert offset_b == 0
|
||||||
|
cost += 1
|
||||||
|
j += 1
|
||||||
elif b.startswith(a):
|
elif b.startswith(a):
|
||||||
cost += 1
|
cost += 1
|
||||||
if offset_a == 0:
|
if offset_a == 0:
|
||||||
|
|
|
@ -197,6 +197,7 @@ def test_roundtrip_docs_to_json():
|
||||||
["a", "b", "c", "d"],
|
["a", "b", "c", "d"],
|
||||||
(3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
|
(3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
|
||||||
),
|
),
|
||||||
|
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_align(tokens_a, tokens_b, expected):
|
def test_align(tokens_a, tokens_b, expected):
|
||||||
|
@ -205,3 +206,12 @@ def test_align(tokens_a, tokens_b, expected):
|
||||||
# check symmetry
|
# check symmetry
|
||||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
|
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
|
||||||
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
|
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_goldparse_startswith_space(en_tokenizer):
|
||||||
|
text = " a"
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
g = GoldParse(doc, words=["a"], entities=["U-DATE"], deps=["ROOT"], heads=[0])
|
||||||
|
assert g.words == [" ", "a"]
|
||||||
|
assert g.ner == [None, "U-DATE"]
|
||||||
|
assert g.labels == [None, "ROOT"]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user