modified gold.align to handle space tokens (#4537)

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
tamuhey 2019-10-28 23:44:28 +09:00 committed by Ines Montani
parent f2bfaa1b38
commit df293f3894
2 changed files with 18 additions and 0 deletions

View File

@ -132,6 +132,14 @@ def align(tokens_a, tokens_b):
offset_a = offset_b = 0
i += 1
j += 1
elif a == "":
assert offset_a == 0
cost += 1
i += 1
elif b == "":
assert offset_b == 0
cost += 1
j += 1
elif b.startswith(a):
cost += 1
if offset_a == 0:

View File

@ -197,6 +197,7 @@ def test_roundtrip_docs_to_json():
["a", "b", "c", "d"],
(3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
),
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
],
)
def test_align(tokens_a, tokens_b, expected):
@ -205,3 +206,12 @@ def test_align(tokens_a, tokens_b, expected):
# check symmetry
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
def test_goldparse_startswith_space(en_tokenizer):
    """Check GoldParse alignment for a text that begins with a space.

    The gold annotations describe the single word "a", while the parse is
    expected to expose two words (" " and "a"). The extra leading entry
    must carry ``None`` for both its entity tag and its dependency label.
    """
    tokenized = en_tokenizer(" a")
    gold = GoldParse(
        tokenized,
        words=["a"],
        entities=["U-DATE"],
        deps=["ROOT"],
        heads=[0],
    )
    # The first position has no gold counterpart, hence the None placeholders.
    assert gold.words == [" ", "a"]
    assert gold.ner == [None, "U-DATE"]
    assert gold.labels == [None, "ROOT"]