From 3d4bd96e8a36814132e99874c9b461f0639f2fd5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 2 Nov 2016 20:41:43 +0100
Subject: [PATCH] Fix infixes in french

---
Reviewer note (ignored by git-am): the look-ahead classes of the "-" and
"--" infix rules were written `[a-zA-z]`. The ASCII range `A-z` also spans
`[`, `\`, `]`, `^`, `_` and the backtick, so a hyphen followed by one of
those characters was treated as an infix. Both are corrected to `[a-zA-Z]`,
matching the other classes in the tuple. The blob id on the `index` line
still refers to the originally committed content. Blank context lines and
continuation indentation were reconstructed from the hunk counts.

 spacy/fr/language_data.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py
index f43bc3681..417cd9828 100644
--- a/spacy/fr/language_data.py
+++ b/spacy/fr/language_data.py
@@ -114,7 +114,10 @@ _
 '''.strip().split('\n')
 
 
-TOKENIZER_INFIXES = tuple()
+TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-Z]) '''
+                     r'''(?<=[a-zA-Z])--(?=[a-zA-Z]) (?<=[0-9])-(?=[0-9]) '''
+                     r'''(?<=[A-Za-z]),(?=[A-Za-z])''').split()
+
 
 
 TOKENIZER_EXCEPTIONS = {