Update customizing-tokenizer.jade

Update some code examples:
- `me` -> `-PRON-`
- `TAG` -> `POS`
- `create_tokenizer` function
This commit is contained in:
Yam 2017-09-22 10:53:22 +08:00 committed by GitHub
parent ea2732469b
commit 6f450306c3

View File

@ -40,7 +40,9 @@ p
{ {
ORTH: u'me'}]) ORTH: u'me'}])
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that'] # Pronoun lemma is returned as -PRON-
# For more details, see: https://spacy.io/docs/usage/troubleshooting#pron-lemma
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
p p
| The special case doesn't have to match an entire whitespace-delimited | The special case doesn't have to match an entire whitespace-delimited
@ -57,7 +59,7 @@ p
+code. +code.
nlp.tokenizer.add_special_case(u'...gimme...?', nlp.tokenizer.add_special_case(u'...gimme...?',
[{ [{
ORTH: u'...gimme...?', LEMMA: u'give', TAG: u'VB'}]) ORTH: u'...gimme...?', LEMMA: u'give', POS: u'VB'}])
assert len(nlp(u'...gimme...?')) == 1 assert len(nlp(u'...gimme...?')) == 1
p p
@ -172,12 +174,14 @@ p
prefix_re = re.compile(r'''[\[\("']''') prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''') suffix_re = re.compile(r'''[\]\)"']''')
infix_re = re.compile(r'''[-~]''')
def create_tokenizer(nlp): def create_tokenizer(nlp):
return Tokenizer(nlp.vocab, return Tokenizer(nlp.vocab, rules={},
prefix_search=prefix_re.search, prefix_search=prefix_re.search,
suffix_search=suffix_re.search) suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer)
nlp = spacy.load('en', tokenizer=create_make_doc) nlp = spacy.load('en', create_make_doc=create_tokenizer)
p p
| If you need to subclass the tokenizer instead, the relevant methods to | If you need to subclass the tokenizer instead, the relevant methods to