From 41a90a7fbb7927f4d0c5a5d22f9669cb91e87347 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 3 Nov 2016 00:03:34 +0100 Subject: [PATCH] Add tokenizer exception for 'Ph.D.', to fix 592. --- spacy/en/language_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py index a3e63d0c4..504103566 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/language_data.py @@ -112,6 +112,10 @@ TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z] TOKENIZER_EXCEPTIONS = { + "Ph.D.": [ + { + "F": "Ph.D." + }], "d.": [ { "F": "d."