adjust whitespace tokenizer to avoid sep in split()

This commit is contained in:
svlandeg 2021-06-16 10:58:45 +02:00
parent b09be3e1cb
commit 29d83dec0c
2 changed files with 18 additions and 1 deletion

View File

@@ -419,6 +419,23 @@ def test_language_from_config_before_after_init_invalid():
English.from_config(config)
def test_language_whitespace_tokenizer():
    """Test the custom whitespace tokenizer from the docs."""

    class WhitespaceTokenizer:
        # Minimal tokenizer: one token per whitespace-separated chunk.
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            # str.split() with no separator collapses runs of whitespace and
            # never yields empty strings, which Doc would reject as words.
            tokens = text.split()
            return Doc(self.vocab, words=tokens)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
    doc = nlp("What's happened to me? he thought. It wasn't a dream. ")
    assert doc
def test_language_custom_tokenizer():
"""Test that a fully custom tokenizer can be plugged in via the registry."""
name = "test_language_custom_tokenizer"

View File

@@ -1168,7 +1168,7 @@ class WhitespaceTokenizer:
self.vocab = vocab
def __call__(self, text):
words = text.split(" ")
words = text.split()
return Doc(self.vocab, words=words)
nlp = spacy.blank("en")