mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
modified gold.align to handle space tokens (#4537)
Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
f2bfaa1b38
commit
df293f3894
|
@ -132,6 +132,14 @@ def align(tokens_a, tokens_b):
|
|||
offset_a = offset_b = 0
|
||||
i += 1
|
||||
j += 1
|
||||
elif a == "":
|
||||
assert offset_a == 0
|
||||
cost += 1
|
||||
i += 1
|
||||
elif b == "":
|
||||
assert offset_b == 0
|
||||
cost += 1
|
||||
j += 1
|
||||
elif b.startswith(a):
|
||||
cost += 1
|
||||
if offset_a == 0:
|
||||
|
|
|
@ -197,6 +197,7 @@ def test_roundtrip_docs_to_json():
|
|||
["a", "b", "c", "d"],
|
||||
(3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
|
||||
),
|
||||
([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
|
||||
],
|
||||
)
|
||||
def test_align(tokens_a, tokens_b, expected):
|
||||
|
@ -205,3 +206,12 @@ def test_align(tokens_a, tokens_b, expected):
|
|||
# check symmetry
|
||||
cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
|
||||
assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
|
||||
|
||||
|
||||
def test_goldparse_startswith_space(en_tokenizer):
|
||||
text = " a"
|
||||
doc = en_tokenizer(text)
|
||||
g = GoldParse(doc, words=["a"], entities=["U-DATE"], deps=["ROOT"], heads=[0])
|
||||
assert g.words == [" ", "a"]
|
||||
assert g.ner == [None, "U-DATE"]
|
||||
assert g.labels == [None, "ROOT"]
|
||||
|
|
Loading…
Reference in New Issue
Block a user