Merge pull request #749 from sudowork/custom-tokenizer-docs

Fix Custom Tokenizer docs
2026-01-10 02:31:16 +03:00 · 2017-01-18 11:39:43 +11:00 · 2017-01-18 11:39:43 +11:00 · 300650a6f8
commit 300650a6f8
parent dbe8dafb52 7ec710af0e
1 changed file with 6 additions and 3 deletions
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@@ -26,6 +26,9 @@ p
    |  #[+api("tokenizer") #[code Tokenizer]] instance:

 +code.
+    import spacy
+    from spacy.symbols import ORTH, LEMMA, POS
+
    nlp = spacy.load('en')
    assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
    nlp.tokenizer.add_special_case(u'gimme',
@@ -37,7 +40,7 @@ p
            {
                ORTH: u'me'}])
    assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
-    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
+    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']

 p
    |  The special case doesn't have to match an entire whitespace-delimited
@@ -52,9 +55,9 @@ p
    |  The special case rules have precedence over the punctuation splitting:

 +code.
-    nlp.tokenizer.add_special_case(u"...gimme...?",
+    nlp.tokenizer.add_special_case(u'...gimme...?',
        [{
-            ORTH: u'...gimme...?", LEMMA: "give", TAG: "VB"}])
+            ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
    assert len(nlp(u'...gimme...?')) == 1

 p