mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge pull request #8405 from svlandeg/fix/whitespace_tokenizer [ci skip]
This commit is contained in:
commit
af9d984407
|
@ -421,6 +421,37 @@ def test_language_from_config_before_after_init_invalid():
|
|||
English.from_config(config)
|
||||
|
||||
|
||||
def test_language_whitespace_tokenizer():
    """Check that the whitespace tokenizer shown in the docs round-trips text."""

    class WhitespaceTokenizer:
        """Tokenizer that splits on single spaces and tracks trailing spaces."""

        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            pieces = text.split(" ")
            # An empty piece comes from a leading, trailing or repeated
            # space: emit it as a " " token that owns no trailing space,
            # so no zero-length token is ever produced.
            words = [piece if piece != "" else " " for piece in pieces]
            spaces = [piece != "" for piece in pieces]
            if words[-1] == " ":
                # The text ended with a space: the previous token already
                # carries it, so the placeholder token is dropped.
                del words[-1]
                del spaces[-1]
            else:
                # The text ended with a real word: nothing follows it.
                spaces[-1] = False
            return Doc(self.vocab, words=words, spaces=spaces)

    pipeline = spacy.blank("en")
    pipeline.tokenizer = WhitespaceTokenizer(pipeline.vocab)
    text = " What's happened to me? he thought. It wasn't a dream. "
    parsed = pipeline(text)
    assert parsed.text == text
|
||||
|
||||
|
||||
def test_language_custom_tokenizer():
|
||||
"""Test that a fully custom tokenizer can be plugged in via the registry."""
|
||||
name = "test_language_custom_tokenizer"
|
||||
|
|
|
@ -1169,7 +1169,20 @@ class WhitespaceTokenizer:
|
|||
|
||||
def __call__(self, text):
    """Split ``text`` on single spaces and build a ``Doc`` from the pieces.

    Every token gets an explicit trailing-space flag. Empty pieces produced
    by leading, trailing or repeated spaces are turned into ``" "`` tokens
    so that no zero-length token is created.

    text: The string to tokenize.
    RETURNS: A ``Doc`` built from ``self.vocab`` with ``words`` and ``spaces``.
    """
    words = text.split(" ")
    # Start by assuming every token is followed by a single space.
    spaces = [True] * len(words)
    # Avoid zero-length tokens
    for i, word in enumerate(words):
        if word == "":
            words[i] = " "
            spaces[i] = False
    # Remove the final trailing space
    if words[-1] == " ":
        # The text ended with a space, which the previous token already
        # carries, so the placeholder token is dropped.
        words = words[0:-1]
        spaces = spaces[0:-1]
    else:
        # The text ended with a real word: nothing follows it.
        spaces[-1] = False
    return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
nlp = spacy.blank("en")
|
||||
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
|
||||
|
|
Loading…
Reference in New Issue
Block a user