Revert "[#4529] fix: gold pyx (#4530)"

This reverts commit fcd25db033.
2025-10-02 09:56:39 +03:00 · 2019-10-27 16:34:35 +01:00 · 2019-10-27 16:34:35 +01:00 · 9df5a429a6
commit 9df5a429a6
parent 8e7414dace
2 changed files with 3 additions and 18 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -647,9 +647,9 @@ cdef class GoldParse:
            if morphology is None:
                morphology = [None for _ in words]
            if entities is None:
-                entities = ["-" for _ in words]
+                entities = ["-" for _ in doc]
            elif len(entities) == 0:
-                entities = ["O" for _ in words]
+                entities = ["O" for _ in doc]
            else:
                # Translate the None values to '-', to make processing easier.
                # See Issue #2603
@@ -712,9 +712,7 @@ cdef class GoldParse:
                            self.heads[i] = i+1
                            self.labels[i] = "subtok"
                        else:
-                            head_i = heads[i2j_multi[i]]
+                            self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
-                            if head_i:
-                                self.heads[i] = self.gold_to_cand[head_i]
                            self.labels[i] = deps[i2j_multi[i]]
                        # Now set NER...This is annoying because if we've split
                        # got an entity word split into two, we need to adjust the
--- a/spacy/tests/regression/test_issue4529.py
+++ b/spacy/tests/regression/test_issue4529.py
@@ -1,13 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-import pytest
-from spacy.gold import GoldParse
-@pytest.mark.parametrize(
-    "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
-)
-def test_gold_misaligned(en_tokenizer, text, words):
-    doc = en_tokenizer(text)
-    GoldParse(doc, words=words)