From 440b81bddc24669ffe89ef7501fb8c75f98b60d2 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Fri, 8 May 2020 15:10:57 +0200
Subject: [PATCH] Improve exceptions for 'd (would/had) in English (#5379)

Instead of treating `'d` in contractions like `I'd` as `would` in all
cases in the tokenizer exceptions, leave the tagging and lemmatization
up to later components.
---
 spacy/lang/en/tokenizer_exceptions.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index c45197771..62de81912 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -77,12 +77,12 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]:
 
         _exc[orth + "'d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"},
+            {ORTH: "'d", NORM: "'d"},
         ]
 
         _exc[orth + "d"] = [
             {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
-            {ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"},
+            {ORTH: "d", NORM: "'d"},
         ]
 
         _exc[orth + "'d've"] = [
@@ -195,7 +195,10 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
             {ORTH: "'d", NORM: "'d"},
         ]
 
-        _exc[orth + "d"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}]
+        _exc[orth + "d"] = [
+            {ORTH: orth, LEMMA: word, NORM: word},
+            {ORTH: "d", NORM: "'d"}
+        ]
 
         _exc[orth + "'d've"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
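
Note: below is a minimal sketch of the observable effect, assuming only
that spaCy is installed (the sample sentence and the script itself are
illustrative, not part of the patch). With this change, the `'d` token
keeps the norm `'d` instead of being forced to `would`:

    import spacy

    # spacy.blank("en") builds a tokenizer-only pipeline, so the output
    # below reflects the tokenizer exceptions alone, with no tagger or
    # lemmatizer involved.
    nlp = spacy.blank("en")
    doc = nlp("I'd like that.")

    for token in doc:
        # After this patch the second token prints:  'd 'd
        # (previously its norm was hard-coded to "would")
        print(token.text, token.norm_)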