[#4529] fix: gold pyx (#4530)

* fix: gold pyx

* remove print

* skip test in python2

* Add unicode declarations and don't skip test on Python 2
This commit is contained in:
tamuhey 2019-10-27 21:50:07 +09:00 committed by Matthew Honnibal
parent bddfbc7e1b
commit fcd25db033
2 changed files with 18 additions and 3 deletions

View File

@ -636,9 +636,9 @@ cdef class GoldParse:
if morphology is None: if morphology is None:
morphology = [None for _ in words] morphology = [None for _ in words]
if entities is None: if entities is None:
entities = ["-" for _ in doc] entities = ["-" for _ in words]
elif len(entities) == 0: elif len(entities) == 0:
entities = ["O" for _ in doc] entities = ["O" for _ in words]
else: else:
# Translate the None values to '-', to make processing easier. # Translate the None values to '-', to make processing easier.
# See Issue #2603 # See Issue #2603
@ -701,7 +701,9 @@ cdef class GoldParse:
self.heads[i] = i+1 self.heads[i] = i+1
self.labels[i] = "subtok" self.labels[i] = "subtok"
else: else:
self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]] head_i = heads[i2j_multi[i]]
if head_i:
self.heads[i] = self.gold_to_cand[head_i]
self.labels[i] = deps[i2j_multi[i]] self.labels[i] = deps[i2j_multi[i]]
# Now set NER...This is annoying because if we've split # Now set NER...This is annoying because if we've split
# got an entity word split into two, we need to adjust the # got an entity word split into two, we need to adjust the

View File

@ -0,0 +1,13 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.gold import GoldParse
@pytest.mark.parametrize(
    "text,words",
    [
        ("A'B C", ["A", "'", "B", "C"]),
        ("A-B", ["A-B"]),
    ],
)
def test_gold_misaligned(en_tokenizer, text, words):
    """Regression test for issue #4529: build a GoldParse from a word list.

    Tokenizes ``text`` with the ``en_tokenizer`` fixture and constructs a
    ``GoldParse`` from the resulting doc and the gold-standard ``words``.
    Going by the test name, the word lists presumably do not line up
    one-to-one with the tokenizer's output (not verified here).

    There is no explicit assertion: the test passes as long as the
    constructor does not raise.
    """
    tokenized = en_tokenizer(text)
    GoldParse(tokenized, words=words)