* Fix runtime error introduced by the updated Span.root function.

2025-12-03 00:06:02 +03:00 · 2016-01-25 15:22:42 +01:00 · 2016-01-25 15:22:42 +01:00 · 87172a15c6
commit 87172a15c6
parent 2c8dd91785
2 changed files with 23 additions and 6 deletions
--- a/spacy/tests/tokens/test_tokens_api.py
+++ b/spacy/tests/tokens/test_tokens_api.py
@ -161,3 +161,21 @@ def test_merge_hang():
    doc.from_array([HEAD], heads.T)
    doc.merge(18, 32, '', '', 'ORG')
    doc.merge(8, 32, '', '', 'ORG')
+
+
+@pytest.mark.models
+def test_runtime_error(EN):
+    # Regression test: example that caused a run-time error while parsing Reddit. No asserts; it passes if nothing raises.
+    text = u'67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school'
+    doc = EN(text)
+    nps = []  # argument tuples for doc.merge(), all collected before any merge is performed
+    for np in doc.noun_chunks:
+        while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):  # drop leading tokens until the first one has one of these dep labels
+            np = np[1:]
+        if len(np) > 1:  # keep only chunks that still span more than one token
+            nps.append((np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_))  # np.root is evaluated here, on the trimmed span
+    for np in nps:
+        print(np)  # debug output: the merge arguments about to be used
+        for word in doc:
+            print(word.idx, word.text, word.head.i, word.head.text)  # debug output: every token and its head before this merge
+        doc.merge(*np)  # the 5-tuple is unpacked as the positional arguments of doc.merge
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -190,12 +190,6 @@ cdef class Span:
            for i in range(self.start, self.end):
                if self.start <= (i+self.doc.c[i].head) < self.end:
                    continue
-                # Don't allow punctuation or spaces to be the root, if there are
-                # better candidates
-                if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT):
-                    continue
-                if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE):
-                    continue
                words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
                if words_to_root < current_best:
                    current_best = words_to_root
@ -244,6 +238,11 @@ cdef class Span:


 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
+    # Don't allow spaces to be the root, if there are
+    # better candidates
+    if Lexeme.c_check_flag(token.lex, IS_SPACE):
+        return sent_length-1
+
    cdef int n = 0
    while token.head != 0:
        token += token.head