From 6f450306c3429d19472e7ae25bcbcd7f8b835e2d Mon Sep 17 00:00:00 2001
From: Yam
Date: Fri, 22 Sep 2017 10:53:22 +0800
Subject: [PATCH 1/2] Update customizing-tokenizer.jade

update some code examples:
- `me` -> `-PRON-`
- `TAG` -> `POS`
- `create_tokenizer` function
---
 website/docs/usage/customizing-tokenizer.jade | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade
index ca5be9ef1..c7f717380 100644
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@@ -40,7 +40,9 @@ p
         { ORTH: u'me'}])
     assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
-    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
+    # Pronoun lemma is returned as -PRON-
+    # For more details, see: https://spacy.io/docs/usage/troubleshooting#pron-lemma
+    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
 
 p
     | The special case doesn't have to match an entire whitespace-delimited
@@ -57,7 +59,7 @@ p
 
 +code.
     nlp.tokenizer.add_special_case(u'...gimme...?', [{
-        ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
+        ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}])
     assert len(nlp(u'...gimme...?')) == 1
 
 p
@@ -172,12 +174,14 @@ p
     prefix_re = re.compile(r'''[\[\("']''')
     suffix_re = re.compile(r'''[\]\)"']''')
+    infix_re = re.compile(r'''[-~]''')
 
     def create_tokenizer(nlp):
-        return Tokenizer(nlp.vocab,
+        return Tokenizer(nlp.vocab, rules={},
                          prefix_search=prefix_re.search,
-                         suffix_search=suffix_re.search)
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer)
 
-    nlp = spacy.load('en', tokenizer=create_make_doc)
+    nlp = spacy.load('en', create_make_doc=create_tokenizer)
 
 p
     | If you need to subclass the tokenizer instead, the relevant methods to

From 54855f0eee6707798caa58d41d192ec4401a5763 Mon Sep 17 00:00:00 2001
From: Yam
Date: Fri, 22 Sep 2017 12:15:48 +0800
Subject: [PATCH 2/2] Update customizing-tokenizer.jade

---
 website/docs/usage/customizing-tokenizer.jade | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade
index c7f717380..c2f840a27 100644
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@@ -59,7 +59,7 @@ p
 
 +code.
     nlp.tokenizer.add_special_case(u'...gimme...?', [{
-        ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}])
+        ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
     assert len(nlp(u'...gimme...?')) == 1
 
 p
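
Note: after both commits, the special-case snippet ends up back on `TAG` rather than `POS` -- `TAG` is the fine-grained attribute that takes values like `VB`, while `POS` expects coarse-grained labels such as `VERB`, which is presumably why the second commit reverts that part. Below is a minimal, self-contained sketch of the special-case behaviour the patch documents. It assumes the spaCy 1.x API these docs describe and an installed `en` model; combining `TAG: u'VB'` with the `gimme` example is an illustration, not a line from the docs.

    import spacy
    from spacy.attrs import ORTH, LEMMA, TAG

    nlp = spacy.load('en')

    # Split 'gimme' into 'gim' + 'me'. Only ORTH is required; LEMMA and
    # TAG on the first subtoken are optional extras.
    nlp.tokenizer.add_special_case(u'gimme', [
        {ORTH: u'gim', LEMMA: u'give', TAG: u'VB'},
        {ORTH: u'me'}])

    assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
    # Pronoun lemmas come back as the shared placeholder -PRON-:
    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']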
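
The custom-tokenizer hunk is also worth seeing as one runnable piece. The sketch below mirrors the patched docs, with the imports added: `rules={}` disables the default special-case rules, and `create_make_doc=create_tokenizer` is the spaCy 1.x-era hook the patch documents (in later spaCy versions you would assign `nlp.tokenizer` directly instead).

    import re
    import spacy
    from spacy.tokenizer import Tokenizer

    # [, ( and quotes are stripped as prefixes; ], ) and quotes as
    # suffixes; hyphens and tildes are split out as infixes.
    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')
    infix_re = re.compile(r'''[-~]''')

    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab, rules={},
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer)

    nlp = spacy.load('en', create_make_doc=create_tokenizer)
    # '-' is matched by infix_re, so it should come out as its own token:
    assert [t.text for t in nlp(u'hello-world')] == [u'hello', u'-', u'world']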