From 2639ecd5f8f22f75b8d1ab14f550a3914e39f3f4 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 23:33:18 +0100 Subject: [PATCH] Add docs note on custom tokenizer rules (see #1491) --- .../_linguistic-features/_tokenization.jade | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/website/usage/_linguistic-features/_tokenization.jade b/website/usage/_linguistic-features/_tokenization.jade index 182bc31e9..f635e6658 100644 --- a/website/usage/_linguistic-features/_tokenization.jade +++ b/website/usage/_linguistic-features/_tokenization.jade @@ -198,11 +198,11 @@ p | #[code .finditer()] methods: +code. - import re + import regex as re from spacy.tokenizer import Tokenizer - prefix_re = re.compile(r'''[\[\("']''') - suffix_re = re.compile(r'''[\]\)"']''') + prefix_re = re.compile(r'''^[\[\("']''') + suffix_re = re.compile(r'''[\]\)"']$''') infix_re = re.compile(r'''[-~]''') simple_url_re = re.compile(r'''^https?://''') @@ -220,6 +220,17 @@ p | specialize are #[code find_prefix], #[code find_suffix] and | #[code find_infix]. ++infobox("Important note", "⚠️") + | When customising the prefix, suffix and infix handling, remember that + | you're passing in #[strong functions] for spaCy to execute, e.g. + | #[code prefix_re.search] – not just the regular expressions. This means + | that your functions also need to define how the rules should be applied. + | For example, if you're adding your own prefix rules, you need + | to make sure they're only applied to characters at the + | #[strong beginning of a token], e.g. by adding #[code ^]. Similarly, + | suffix rules should only be applied at the #[strong end of a token], + | so your expression should end with a #[code $]. + +h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline p