From a338c6f8f64906f9f4c8b109779a11b717c4792e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Dec 2018 10:41:24 +0100 Subject: [PATCH] Fix JSON segmentation bug that affected French Fix a bug in the JSON streaming code that GoldCorpus uses. Escaped backslashes were being handled incorrectly. This bug caused low scores for French in the early v2.1.0 alphas, because most of the data was not being read in. Fittingly, the document that triggered the bug was a Wikipedia article about Perl. Parsing Perl remains difficult! --- spacy/gold.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 8bdd42a83..69e256167 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -346,12 +346,12 @@ def _json_iterate(loc): cdef char close_curly = ord('}') for i in range(len(py_raw)): c = raw[i] - if c == backslash: - escape = True - continue if escape: escape = False continue + if c == backslash: + escape = True + continue if c == quote: inside_string = not inside_string continue