diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index d75aca966..c911b8d81 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -421,6 +421,37 @@ def test_language_from_config_before_after_init_invalid():
         English.from_config(config)
 
 
+def test_language_whitespace_tokenizer():
+    """Test the custom whitespace tokenizer from the docs."""
+
+    class WhitespaceTokenizer:
+        def __init__(self, vocab):
+            self.vocab = vocab
+
+        def __call__(self, text):
+            words = text.split(" ")
+            spaces = [True] * len(words)
+            # Avoid zero-length tokens
+            for i, word in enumerate(words):
+                if word == "":
+                    words[i] = " "
+                    spaces[i] = False
+            # Remove the final trailing space
+            if words[-1] == " ":
+                words = words[0:-1]
+                spaces = spaces[0:-1]
+            else:
+                spaces[-1] = False
+
+            return Doc(self.vocab, words=words, spaces=spaces)
+
+    nlp = spacy.blank("en")
+    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
+    text = " What's happened to me? he thought. It wasn't a dream. "
+    doc = nlp(text)
+    assert doc.text == text
+
+
 def test_language_custom_tokenizer():
     """Test that a fully custom tokenizer can be plugged in via the registry."""
     name = "test_language_custom_tokenizer"
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index b05d16da3..42476cd98 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1169,7 +1169,20 @@ class WhitespaceTokenizer:
 
     def __call__(self, text):
         words = text.split(" ")
-        return Doc(self.vocab, words=words)
+        spaces = [True] * len(words)
+        # Avoid zero-length tokens
+        for i, word in enumerate(words):
+            if word == "":
+                words[i] = " "
+                spaces[i] = False
+        # Remove the final trailing space
+        if words[-1] == " ":
+            words = words[0:-1]
+            spaces = spaces[0:-1]
+        else:
+            spaces[-1] = False
+
+        return Doc(self.vocab, words=words, spaces=spaces)
 
 nlp = spacy.blank("en")
 nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
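
For quick manual verification outside the test suite, here is a standalone sketch (assuming spaCy v3; the class is copied verbatim from the patch above, and the sample strings are illustrative). It shows that doc.text now round-trips leading, doubled, and trailing spaces, which the previous one-liner `return Doc(self.vocab, words=words)` did not preserve: with default spaces all set to True, it appended a trailing space after the last token, and runs of spaces produced the zero-length tokens the new loop avoids.

    # Standalone check of the tokenizer logic added in this diff.
    # Assumes spaCy v3; the class below is copied verbatim from the patch.
    import spacy
    from spacy.tokens import Doc

    class WhitespaceTokenizer:
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(" ")
            spaces = [True] * len(words)
            # Avoid zero-length tokens
            for i, word in enumerate(words):
                if word == "":
                    words[i] = " "
                    spaces[i] = False
            # Remove the final trailing space
            if words[-1] == " ":
                words = words[0:-1]
                spaces = spaces[0:-1]
            else:
                spaces[-1] = False
            return Doc(self.vocab, words=words, spaces=spaces)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # Leading, doubled, and trailing spaces all survive the round trip;
    # extra spaces become single-space tokens with spaces[i] = False.
    for sample in ["hello world", " hello  world "]:
        doc = nlp(sample)
        assert doc.text == sample
        print([token.text for token in doc])
    # ['hello', 'world']
    # [' ', 'hello', ' ', 'world']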