Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-05 14:10:34 +03:00)
Document Tokenizer(token_match) and clarify tokenizer_pseudo_code
Closes #835. In the `tokenizer_pseudo_code` I put the `special_cases` kwarg before `find_prefix` because this now matches the order the args are used in the pseudocode, and it also matches spaCy's actual code.
This commit is contained in:
parent 2f8d535f65
commit b6ebedd09c
@@ -87,8 +87,8 @@ p
     | algorithm in Python, optimized for readability rather than performance:

 +code.
-    def tokenizer_pseudo_code(text, find_prefix, find_suffix,
-                              find_infixes, special_cases):
+    def tokenizer_pseudo_code(text, special_cases,
+                              find_prefix, find_suffix, find_infixes):
         tokens = []
         for substring in text.split(' '):
             suffixes = []
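The new argument order is easier to appreciate with the rest of the loop in view. Below is a rough, self-contained sketch of such a whitespace-then-affix tokenizer, for illustration only (it is not the exact pseudo-code from the docs page; the helpers are assumed to return a match length or `None`, and `find_infixes` a list of regex match objects). It shows why `special_cases` now leads the signature: it is the first thing consulted for every substring.

    def tokenizer_pseudo_code(text, special_cases,
                              find_prefix, find_suffix, find_infixes):
        tokens = []
        for substring in text.split(' '):
            suffixes = []
            while substring:
                if substring in special_cases:
                    # Special cases win over all affix rules, hence the
                    # argument order: special_cases is consulted first.
                    tokens.extend(special_cases[substring])
                    substring = ''
                elif find_prefix(substring) is not None:
                    split = find_prefix(substring)
                    tokens.append(substring[:split])
                    substring = substring[split:]
                elif find_suffix(substring) is not None:
                    split = find_suffix(substring)
                    suffixes.append(substring[-split:])
                    substring = substring[:-split]
                elif find_infixes(substring):
                    offset = 0
                    for match in find_infixes(substring):
                        if match.start() > offset:
                            tokens.append(substring[offset:match.start()])
                        tokens.append(substring[match.start():match.end()])
                        offset = match.end()
                    substring = substring[offset:]
                else:
                    tokens.append(substring)
                    substring = ''
            tokens.extend(reversed(suffixes))
        return tokens

    # Example with trivially simple (hypothetical) helpers:
    # tokenizer_pseudo_code("don't stop", {"don't": ["do", "n't"]},
    #                       lambda s: None, lambda s: None, lambda s: [])
    # -> ['do', "n't", 'stop']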
@@ -140,7 +140,7 @@ p
 p
     | Let's imagine you wanted to create a tokenizer for a new language. There
-    | are four things you would need to define:
+    | are five things you would need to define:

 +list("numbers")
     +item
@@ -162,6 +162,11 @@ p
         | A function #[code infixes_finditer], to handle non-whitespace
         | separators, such as hyphens etc.

+    +item
+        | (Optional) A boolean function #[code token_match] matching strings
+        | that should never be split, overriding the previous rules.
+        | Useful for things like URLs or numbers.
+
 p
     | You shouldn't usually need to create a #[code Tokenizer] subclass.
     | Standard usage is to use #[code re.compile()] to build a regular
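The new list item describes `token_match` as a boolean-style predicate. A minimal sketch of what such a callable could look like (hypothetical, not part of this commit; any callable that returns something truthy for strings that must stay whole would do):

    import re

    # A bound regex method works as a token_match ...
    simple_url_re = re.compile(r'''^https?://''')
    url_match = simple_url_re.match

    # ... and so does a plain function (hypothetical helper, not spaCy API):
    def keep_whole(substring):
        return substring.startswith('@') or simple_url_re.match(substring)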
@@ -175,11 +180,15 @@ p
     prefix_re = re.compile(r'''[\[\("']''')
     suffix_re = re.compile(r'''[\]\)"']''')
     infix_re = re.compile(r'''[-~]''')
+    simple_url_re = re.compile(r'''^https?://''')
     def create_tokenizer(nlp):
-        return Tokenizer(nlp.vocab, rules={},
+        return Tokenizer(nlp.vocab,
+                         rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
-                         infix_finditer=infix_re.finditer)
+                         infix_finditer=infix_re.finditer,
+                         token_match=simple_url_re.match
+                         )

     nlp = spacy.load('en', create_make_doc=create_tokenizer)
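Put together, the documented example can be exercised roughly as follows. This is a sketch against the spaCy 1.x API shown on this page (`spacy.load('en', create_make_doc=...)`); in later spaCy versions you would typically assign the custom tokenizer to `nlp.tokenizer` instead, and the sample sentence here is made up:

    import re
    import spacy
    from spacy.tokenizer import Tokenizer

    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')
    infix_re = re.compile(r'''[-~]''')
    simple_url_re = re.compile(r'''^https?://''')

    def create_tokenizer(nlp):
        return Tokenizer(nlp.vocab,
                         rules={},
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer,
                         token_match=simple_url_re.match)

    nlp = spacy.load('en', create_make_doc=create_tokenizer)  # spaCy 1.x style, as on this page
    doc = nlp(u"Visit https://spacy.io (it's great)")         # made-up sample text
    print([t.text for t in doc])
    # token_match is documented above as overriding the prefix/suffix/infix
    # rules, so the URL should come through as a single token.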