From 0a6d7ca2006d520883361d9922282679c4d2d6cc Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 8 Mar 2017 14:33:32 +0100
Subject: [PATCH] Fix spacing after token_match

The boolean flag indicating a space after the token was being set
incorrectly after the token_match regex was applied. Fixes #859.
---
 spacy/tokenizer.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 8f2f111e7..1b74431ff 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -275,7 +275,10 @@ cdef class Tokenizer:
         if cache_hit:
             pass
         elif self.token_match and self.token_match(string):
-            tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+            # We're always saying 'no' to spaces here -- the caller will
+            # fix up the outermost one, with reference to the original.
+            # See Issue #859
+            tokens.push_back(self.vocab.get(tokens.mem, string), False)
         else:
             matches = self.find_infix(string)
             if not matches:
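
A minimal reproduction sketch of the behaviour this patch restores (not part of the
commit; it assumes the spaCy v1.x-era spacy.load('en') entry point and an installed
English model): a token captured by token_match, such as a URL, should keep the
trailing-space flag taken from the original string, so joining tokens with their
whitespace reproduces the input exactly.

    # Hypothetical regression check for the spacing bug (not from the commit).
    # Assumes Token.text_with_ws, which returns the token text plus any
    # trailing whitespace recorded during tokenization.
    import spacy

    nlp = spacy.load('en')
    text = u"See http://example.com for details."
    doc = nlp(text)

    # Before this fix, the space after a token matched by token_match
    # (here the URL) could be dropped, breaking the round trip.
    assert ''.join(t.text_with_ws for t in doc) == text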