From 2ef170a991acda127b624fab27eb619379799899 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 16 Apr 2015 04:28:06 +0200
Subject: [PATCH] * Fix Issue #54: Error merging multi-word token when there's a mid-token match.

---
 spacy/tokens.pyx    | 2 ++
 tests/test_merge.py | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index b61302c24..27d99a045 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -281,6 +281,8 @@ cdef class Tokens:
             if self.data[i].idx == start_idx:
                 start = i
             if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                if start == -1:
+                    return None
                 end = i + 1
                 break
         else:
diff --git a/tests/test_merge.py b/tests/test_merge.py
index 39693b178..370a334b8 100644
--- a/tests/test_merge.py
+++ b/tests/test_merge.py
@@ -30,3 +30,9 @@ def test_merge_heads():
     assert tokens[3].head.i == 1
     assert tokens[4].head.i in [1, 3]
     assert tokens[5].head.i == 4
+
+
+def test_issue_54():
+    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
+    tokens = NLU(text, merge_mwes=True)
+