From 29d83dec0c0c53625a0b3cc299c8236bcb0b6e89 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 16 Jun 2021 10:58:45 +0200
Subject: [PATCH] adjust whitespace tokenizer to avoid sep in split()

---
Note (ignored by `git am`, not part of the commit message):
str.split(" ") returns empty strings for leading, trailing, or repeated
spaces, whereas str.split() with no separator splits on runs of whitespace
and never yields empty strings. The new test feeds the documented tokenizer
a text with trailing whitespace.

 spacy/tests/test_language.py              | 17 +++++++++++++++++
 website/docs/usage/linguistic-features.md |  2 +-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 86cce5f9e..72d1597fd 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -419,6 +419,23 @@ def test_language_from_config_before_after_init_invalid():
         English.from_config(config)
 
 
+def test_language_whitespace_tokenizer():
+    """Test the custom whitespace tokenizer from the docs."""
+
+    class WhitespaceTokenizer:
+        def __init__(self, vocab):
+            self.vocab = vocab
+
+        def __call__(self, text):
+            words = text.split()
+            return Doc(self.vocab, words=words)
+
+    nlp = spacy.blank("en")
+    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
+    doc = nlp("What's happened to me? he thought. It wasn't a dream. ")
+    assert doc
+
+
 def test_language_custom_tokenizer():
     """Test that a fully custom tokenizer can be plugged in via the registry."""
     name = "test_language_custom_tokenizer"
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5a1293c2e..7dc6cff25 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1168,7 +1168,7 @@ class WhitespaceTokenizer:
         self.vocab = vocab
 
     def __call__(self, text):
-        words = text.split(" ")
+        words = text.split()
         return Doc(self.vocab, words=words)
 
 nlp = spacy.blank("en")