adjust whitespace tokenizer to avoid sep in split()

This commit is contained in:
svlandeg 2021-06-16 10:58:45 +02:00
parent b09be3e1cb
commit 29d83dec0c
2 changed files with 18 additions and 1 deletion

View File

@@ -419,6 +419,23 @@ def test_language_from_config_before_after_init_invalid():
English.from_config(config)
def test_language_whitespace_tokenizer():
    """Test the custom whitespace tokenizer from the docs."""

    class WhitespaceTokenizer:
        # Minimal tokenizer: one token per whitespace-separated chunk.
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            # str.split() with no separator collapses runs of whitespace and
            # never yields empty strings, which Doc would reject as words.
            tokens = text.split()
            return Doc(self.vocab, words=tokens)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
    doc = nlp("What's happened to me? he thought. It wasn't a dream. ")
    assert doc
def test_language_custom_tokenizer():
"""Test that a fully custom tokenizer can be plugged in via the registry."""
name = "test_language_custom_tokenizer"

View File

@@ -1168,7 +1168,7 @@ class WhitespaceTokenizer:
self.vocab = vocab
def __call__(self, text):
words = text.split(" ")
words = text.split()
return Doc(self.vocab, words=words)
nlp = spacy.blank("en")