From 7ec710af0ea0f7ef84ca1fce644a8e8fc6176709 Mon Sep 17 00:00:00 2001
From: Kevin Gao
Date: Tue, 17 Jan 2017 10:35:55 -0800
Subject: [PATCH] Fix Custom Tokenizer docs

- Fix mismatched quotations
- Make it more clear where ORTH, LEMMA, and POS symbols come from
- Make strings consistent
- Fix lemma_ assertion s/-PRON-/me/
---
 website/docs/usage/customizing-tokenizer.jade | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade
index c1a03a14a..d43fb438f 100644
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@@ -26,6 +26,9 @@ p
     | #[+api("tokenizer") #[code Tokenizer]] instance:
 
 +code.
+    import spacy
+    from spacy.symbols import ORTH, LEMMA, POS
+
     nlp = spacy.load('en')
     assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
     nlp.tokenizer.add_special_case(u'gimme',
@@ -37,7 +40,7 @@ p
             {
                 ORTH: u'me'}])
     assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
-    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
+    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
 
 p
     | The special case doesn't have to match an entire whitespace-delimited
@@ -52,9 +55,9 @@ p
     | The special case rules have precedence over the punctuation splitting:
 
 +code.
-    nlp.tokenizer.add_special_case(u"...gimme...?",
+    nlp.tokenizer.add_special_case(u'...gimme...?',
         [{
-            ORTH: u'...gimme...?", LEMMA: "give", TAG: "VB"}])
+            ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
     assert len(nlp(u'...gimme...?')) == 1
 
 p