From 9df5a429a6708151f1ccb5373dc718a7995d2058 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 27 Oct 2019 16:34:35 +0100 Subject: [PATCH] Revert "[#4529] fix: gold pyx (#4530)" This reverts commit fcd25db033976da00af0e765a53df389856cf6a8. --- spacy/gold.pyx | 8 +++----- spacy/tests/regression/test_issue4529.py | 13 ------------- 2 files changed, 3 insertions(+), 18 deletions(-) delete mode 100644 spacy/tests/regression/test_issue4529.py diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 817b059ce..19a464523 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -647,9 +647,9 @@ cdef class GoldParse: if morphology is None: morphology = [None for _ in words] if entities is None: - entities = ["-" for _ in words] + entities = ["-" for _ in doc] elif len(entities) == 0: - entities = ["O" for _ in words] + entities = ["O" for _ in doc] else: # Translate the None values to '-', to make processing easier. # See Issue #2603 @@ -712,9 +712,7 @@ cdef class GoldParse: self.heads[i] = i+1 self.labels[i] = "subtok" else: - head_i = heads[i2j_multi[i]] - if head_i: - self.heads[i] = self.gold_to_cand[head_i] + self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]] self.labels[i] = deps[i2j_multi[i]] # Now set NER...This is annoying because if we've split # got an entity word split into two, we need to adjust the diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py deleted file mode 100644 index 381957be6..000000000 --- a/spacy/tests/regression/test_issue4529.py +++ /dev/null @@ -1,13 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest -from spacy.gold import GoldParse - - -@pytest.mark.parametrize( - "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])] -) -def test_gold_misaligned(en_tokenizer, text, words): - doc = en_tokenizer(text) - GoldParse(doc, words=words)