diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade
index c2f840a27..173521a33 100644
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@@ -87,8 +87,8 @@ p
     | algorithm in Python, optimized for readability rather than performance:
 
 +code.
-    def tokenizer_pseudo_code(text, find_prefix, find_suffix,
-                              find_infixes, special_cases):
+    def tokenizer_pseudo_code(text, special_cases,
+                              find_prefix, find_suffix, find_infixes):
         tokens = []
         for substring in text.split(' '):
             suffixes = []
@@ -140,7 +140,7 @@ p
 
 p
     | Let's imagine you wanted to create a tokenizer for a new language. There
-    | are four things you would need to define:
+    | are five things you would need to define:
 
 +list("numbers")
     +item
@@ -162,6 +162,11 @@ p
         | A function #[code infixes_finditer], to handle non-whitespace
         | separators, such as hyphens etc.
 
+    +item
+        | (Optional) A boolean function #[code token_match] matching strings
+        | that should never be split, overriding the previous rules.
+        | Useful for things like URLs or numbers.
+
 p
     | You shouldn't usually need to create a #[code Tokenizer] subclass.
     | Standard usage is to use #[code re.compile()] to build a regular
@@ -175,11 +180,15 @@ p
     prefix_re = re.compile(r'''[\[\("']''')
     suffix_re = re.compile(r'''[\]\)"']''')
     infix_re = re.compile(r'''[-~]''')
+    simple_url_re = re.compile(r'''^https?://''')
 
     def create_tokenizer(nlp):
-        return Tokenizer(nlp.vocab, rules={},
+        return Tokenizer(nlp.vocab,
+                         rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
-                         infix_finditer=infix_re.finditer)
+                         infix_finditer=infix_re.finditer,
+                         token_match=simple_url_re.match
+                         )
 
     nlp = spacy.load('en', create_make_doc=create_tokenizer)
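
A quick way to see what the new token_match argument documented above actually does is a standalone sketch like the one below. It is not taken from the patch: it assumes a spaCy version whose Tokenizer accepts token_match (1.6 or later), and it builds the tokenizer from a bare Vocab instead of wiring it into spacy.load() through the create_make_doc hook used in the diff's example.

    # Not part of the diff: a minimal sketch of the behaviour the new
    # token_match argument describes. Assumes a spaCy version whose
    # Tokenizer accepts token_match (1.6+); uses a bare Vocab rather than
    # the create_make_doc hook shown in the patch.
    import re

    from spacy.tokenizer import Tokenizer
    from spacy.vocab import Vocab

    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')
    infix_re = re.compile(r'''[-~]''')
    simple_url_re = re.compile(r'''^https?://''')

    tokenizer = Tokenizer(Vocab(),
                          rules={},
                          prefix_search=prefix_re.search,
                          suffix_search=suffix_re.search,
                          infix_finditer=infix_re.finditer,
                          token_match=simple_url_re.match)

    doc = tokenizer("check out https://example.com/one-two please")
    print([token.text for token in doc])
    # Because simple_url_re matches the URL, token_match keeps it as a
    # single token; without it, the infix rule would split it at the hyphen.

The example text and the one-two path segment are only illustrative; the point is that any substring matched by simple_url_re bypasses the prefix, suffix and infix rules, which is the behaviour the new list item in the docs describes.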