mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge pull request #1352 from hscspring/patch-5
Update customizing-tokenizer.jade
This commit is contained in:
commit
9177313063
|
@ -40,7 +40,9 @@ p
|
||||||
{
|
{
|
||||||
ORTH: u'me'}])
|
ORTH: u'me'}])
|
||||||
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
|
assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that']
|
||||||
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'me', u'that']
|
# Pronoun lemma is returned as -PRON-
|
||||||
|
# For more details, see: https://spacy.io/docs/usage/troubleshooting#pron-lemma
|
||||||
|
assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that']
|
||||||
|
|
||||||
p
|
p
|
||||||
| The special case doesn't have to match an entire whitespace-delimited
|
| The special case doesn't have to match an entire whitespace-delimited
|
||||||
|
@ -172,12 +174,14 @@ p
|
||||||
|
|
||||||
prefix_re = re.compile(r'''[\[\("']''')
|
prefix_re = re.compile(r'''[\[\("']''')
|
||||||
suffix_re = re.compile(r'''[\]\)"']''')
|
suffix_re = re.compile(r'''[\]\)"']''')
|
||||||
|
infix_re = re.compile(r'''[-~]''')
|
||||||
def create_tokenizer(nlp):
|
def create_tokenizer(nlp):
|
||||||
return Tokenizer(nlp.vocab,
|
return Tokenizer(nlp.vocab, rules={},
|
||||||
prefix_search=prefix_re.search,
|
prefix_search=prefix_re.search,
|
||||||
suffix_search=suffix_re.search)
|
suffix_search=suffix_re.search,
|
||||||
|
infix_finditer=infix_re.finditer)
|
||||||
|
|
||||||
nlp = spacy.load('en', tokenizer=create_make_doc)
|
nlp = spacy.load('en', create_make_doc=create_tokenizer)
|
||||||
|
|
||||||
p
|
p
|
||||||
| If you need to subclass the tokenizer instead, the relevant methods to
|
| If you need to subclass the tokenizer instead, the relevant methods to
|
||||||
|
|
Loading…
Reference in New Issue
Block a user