Fix Custom Tokenizer docs

- Fix mismatched quotations
- Make it clearer where the ORTH, LEMMA, and POS symbols come from
- Make strings consistent
- Fix lemma_ assertion s/-PRON-/me/
Kevin Gao 2017-01-17 10:35:55 -08:00
parent dbe8dafb52
commit 7ec710af0e

@@ -26,6 +26,9 @@ p
     | #[+api("tokenizer") #[code Tokenizer]] instance:

 +code.
+    import spacy
+    from spacy.symbols import ORTH, LEMMA, POS
+
     nlp = spacy.load('en')
     assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']
     nlp.tokenizer.add_special_case(u'gimme',
@@ -37,7 +40,7 @@ p
         {
             ORTH: u'me'}])
     assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
-    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
+    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']

 p
     | The special case doesn't have to match an entire whitespace-delimited
@@ -52,9 +55,9 @@ p
     | The special case rules have precedence over the punctuation splitting:

 +code.
-    nlp.tokenizer.add_special_case(u"...gimme...?",
+    nlp.tokenizer.add_special_case(u'...gimme...?',
         [{
-            ORTH: u'...gimme...?", LEMMA: "give", TAG: "VB"}])
+            ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}])
     assert len(nlp(u'...gimme...?')) == 1

 p
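
For reference, the corrected first snippet assembled into one runnable block. This is a minimal sketch: the definition of the first special-case token (the ORTH/LEMMA/POS values for 'gim') falls between the hunks shown above and is filled in here as an assumption based on the surrounding docs, and the example assumes spaCy 1.x with the 'en' model installed.

    import spacy
    from spacy.symbols import ORTH, LEMMA, POS

    nlp = spacy.load('en')  # assumes the English model is installed
    assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that']

    # Register a special-case rule that splits 'gimme' into 'gim' + 'me';
    # the LEMMA/POS values for the first token are not visible in this diff
    # and are assumed here.
    nlp.tokenizer.add_special_case(u'gimme',
        [{
            ORTH: u'gim',
            LEMMA: u'give',
            POS: u'VERB'},
        {
            ORTH: u'me'}])

    assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
    assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']

The second snippet in the diff only normalises quoting; its behaviour, matching the whole string u'...gimme...?' as a single token, is unchanged.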