From 2639ecd5f8f22f75b8d1ab14f550a3914e39f3f4 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Fri, 3 Nov 2017 23:33:18 +0100
Subject: [PATCH] Add docs note on custom tokenizer rules (see #1491)

---
 .../_linguistic-features/_tokenization.jade     | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/website/usage/_linguistic-features/_tokenization.jade b/website/usage/_linguistic-features/_tokenization.jade
index 182bc31e9..f635e6658 100644
--- a/website/usage/_linguistic-features/_tokenization.jade
+++ b/website/usage/_linguistic-features/_tokenization.jade
@@ -198,11 +198,11 @@ p
     |  #[code .finditer()] methods:
 
 +code.
-    import re
+    import regex as re
     from spacy.tokenizer import Tokenizer
 
-    prefix_re = re.compile(r'''[\[\(&quot;&apos;]''')
-    suffix_re = re.compile(r'''[\]\)&quot;&apos;]''')
+    prefix_re = re.compile(r'''^[\[\(&quot;&apos;]''')
+    suffix_re = re.compile(r'''[\]\)&quot;&apos;]$''')
     infix_re = re.compile(r'''[-~]''')
     simple_url_re = re.compile(r'''^https?://''')
 
@@ -220,6 +220,17 @@ p
     |  specialize are #[code find_prefix], #[code find_suffix] and
     |  #[code find_infix].
 
++infobox("Important note", "⚠️")
+    |  When customising the prefix, suffix and infix handling, remember that
+    |  you're passing in #[strong functions] for spaCy to execute, e.g.
+    |  #[code prefix_re.search] – not just the regular expressions. This means
+    |  that your functions also need to define how the rules should be applied.
+    |  For example, if you're adding your own prefix rules, you need to make
+    |  sure they're only applied to characters at the
+    |  #[strong beginning of a token], e.g. by starting your expression with
+    |  #[code ^]. Similarly, suffix rules should only be applied at the
+    |  #[strong end of a token], so your expression should end with #[code $].
+
 +h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
 
 p