spaCy (mirror of https://github.com/explosion/spaCy.git)

commit bb9d2f1546
parent 29d83dec0c

    extend example to ensure the text is preserved
@@ -427,13 +427,27 @@ def test_language_whitespace_tokenizer():
             self.vocab = vocab
 
         def __call__(self, text):
-            words = text.split()
-            return Doc(self.vocab, words=words)
+            words = text.split(" ")
+            spaces = [True] * len(words)
+            # Avoid zero-length tokens
+            for i, word in enumerate(words):
+                if word == "":
+                    words[i] = " "
+                    spaces[i] = False
+            # Remove the final trailing space
+            if words[-1] == " ":
+                words = words[0:-1]
+                spaces = spaces[0:-1]
+            else:
+                spaces[-1] = False
+
+            return Doc(self.vocab, words=words, spaces=spaces)
 
     nlp = spacy.blank("en")
     nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
-    doc = nlp("What's happened to me? he thought. It wasn't a dream. ")
-    assert doc
+    text = " What's happened to me? he thought. It wasn't a dream. "
+    doc = nlp(text)
+    assert doc.text == text
 
 
 def test_language_custom_tokenizer():
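For context on the change to __call__ above: Python's str.split() with no
argument collapses runs of whitespace and discards leading and trailing
whitespace, so the original text cannot be rebuilt from the resulting tokens.
Splitting on a single space keeps that information as empty strings, which
the loop then turns into standalone space tokens. A quick illustration
(not part of the commit):

    text = " What's  happened "
    print(text.split())     # ["What's", 'happened']              -- whitespace lost
    print(text.split(" "))  # ['', "What's", '', 'happened', '']  -- recoverable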
@@ -1168,8 +1168,21 @@ class WhitespaceTokenizer:
         self.vocab = vocab
 
     def __call__(self, text):
-        words = text.split()
-        return Doc(self.vocab, words=words)
+        words = text.split(" ")
+        spaces = [True] * len(words)
+        # Avoid zero-length tokens
+        for i, word in enumerate(words):
+            if word == "":
+                words[i] = " "
+                spaces[i] = False
+        # Remove the final trailing space
+        if words[-1] == " ":
+            words = words[0:-1]
+            spaces = spaces[0:-1]
+        else:
+            spaces[-1] = False
+
+        return Doc(self.vocab, words=words, spaces=spaces)
 
 nlp = spacy.blank("en")
 nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
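For reference, the new tokenizer from both hunks assembled into a
self-contained script; a minimal sketch, where the round-trip inputs at the
end are illustrative additions rather than part of the commit:

    import spacy
    from spacy.tokens import Doc

    class WhitespaceTokenizer:
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            words = text.split(" ")
            spaces = [True] * len(words)
            # Avoid zero-length tokens: empty strings produced by leading
            # or doubled spaces become standalone space tokens
            for i, word in enumerate(words):
                if word == "":
                    words[i] = " "
                    spaces[i] = False
            # Remove the final trailing space
            if words[-1] == " ":
                words = words[0:-1]
                spaces = spaces[0:-1]
            else:
                spaces[-1] = False

            return Doc(self.vocab, words=words, spaces=spaces)

    nlp = spacy.blank("en")
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # doc.text should reproduce the input exactly, including leading,
    # trailing, and doubled spaces
    for text in [" leading", "trailing ", "double  space", "plain text"]:
        assert nlp(text).text == text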