mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
Add docs note on custom tokenizer rules (see #1491)
This commit is contained in:
parent
5e7d98f72a
commit
2639ecd5f8
|
@ -198,11 +198,11 @@ p
|
||||||
| #[code .finditer()] methods:
|
| #[code .finditer()] methods:
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
import re
|
import regex as re
|
||||||
from spacy.tokenizer import Tokenizer
|
from spacy.tokenizer import Tokenizer
|
||||||
|
|
||||||
prefix_re = re.compile(r'''[\[\("']''')
|
prefix_re = re.compile(r'''^[\[\("']''')
|
||||||
suffix_re = re.compile(r'''[\]\)"']''')
|
suffix_re = re.compile(r'''[\]\)"']$''')
|
||||||
infix_re = re.compile(r'''[-~]''')
|
infix_re = re.compile(r'''[-~]''')
|
||||||
simple_url_re = re.compile(r'''^https?://''')
|
simple_url_re = re.compile(r'''^https?://''')
|
||||||
|
|
||||||
|
@ -220,6 +220,17 @@ p
|
||||||
| specialize are #[code find_prefix], #[code find_suffix] and
|
| specialize are #[code find_prefix], #[code find_suffix] and
|
||||||
| #[code find_infix].
|
| #[code find_infix].
|
||||||
|
|
||||||
|
+infobox("Important note", "⚠️")
|
||||||
|
| When customising the prefix, suffix and infix handling, remember that
|
||||||
|
| you're passing in #[strong functions] for spaCy to execute, e.g.
|
||||||
|
| #[code prefix_re.search] – not just the regular expressions. This means
|
||||||
|
| that your functions also need to define how the rules should be applied.
|
||||||
|
| For example, if you're adding your own prefix rules, you need
|
||||||
|
| to make sure they're only applied to characters at the
|
||||||
|
| #[strong beginning of a token], e.g. by starting your expression with #[code ^]. Similarly,
|
||||||
|
| suffix rules should only be applied at the #[strong end of a token],
|
||||||
|
| so your expression should end with a #[code $].
|
||||||
|
|
||||||
+h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
|
+h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
Loading…
Reference in New Issue
Block a user