[#4529] fix: gold pyx (#4530)

* fix: gold pyx
* remove print
* skip test in python2
* Add unicode declarations and don't skip test on Python 2
2025-11-01 00:17:44 +03:00 · 2019-10-27 21:50:07 +09:00 · 2019-10-27 21:50:07 +09:00 · fcd25db033
commit fcd25db033
parent bddfbc7e1b
2 changed files with 18 additions and 3 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -636,9 +636,9 @@ cdef class GoldParse:
            if morphology is None:
                morphology = [None for _ in words]
            if entities is None:
-                entities = ["-" for _ in doc]
+                entities = ["-" for _ in words]
            elif len(entities) == 0:
-                entities = ["O" for _ in doc]
+                entities = ["O" for _ in words]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
@@ -701,7 +701,9 @@ cdef class GoldParse:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
-                            self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
+                            head_i = heads[i2j_multi[i]]
+                            if head_i:
+                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        # Now set NER...This is annoying because if we've split
                        # got an entity word split into two, we need to adjust the
--- a/spacy/tests/regression/test_issue4529.py
+++ b/spacy/tests/regression/test_issue4529.py
@@ -0,0 +1,13 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.gold import GoldParse
+
+
+@pytest.mark.parametrize(
+    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
+)
+def test_gold_misaligned(en_tokenizer, text, words):
+    doc = en_tokenizer(text)
+    GoldParse(doc, words=words)