mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Have tokenizer emit tokens for whitespace other than single spaces
This commit is contained in:
parent
43743a5d63
commit
99f5e59286
|
@@ -67,12 +67,13 @@ cdef class Language:
|
|||
cdef Tokens tokens = Tokens(length)
|
||||
if length == 0:
|
||||
return tokens
|
||||
cdef int start = 0
|
||||
cdef int i = 0
|
||||
cdef int start = 0
|
||||
cdef Py_UNICODE* chars = string
|
||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||
cdef String span
|
||||
for i in range(length):
|
||||
if Py_UNICODE_ISSPACE(chars[i]) == 1:
|
||||
for i in range(1, length):
|
||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
lexemes = <LexemeC**>self.cache.get(span.key)
|
||||
|
@@ -80,7 +81,10 @@ cdef class Language:
|
|||
tokens.extend(start, lexemes, 0)
|
||||
else:
|
||||
self._tokenize(tokens, &span, start, i)
|
||||
start = i + 1
|
||||
in_ws = not in_ws
|
||||
start = i
|
||||
if chars[i] == ' ':
|
||||
start += 1
|
||||
i += 1
|
||||
if start < i:
|
||||
string_slice(&span, chars, start, i)
|
||||
|
|
38
tests/test_whitespace.py
Normal file
38
tests/test_whitespace.py
Normal file
|
@@ -0,0 +1,38 @@
|
|||
"""Test that tokens are created correctly for whitespace."""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.en import EN
|
||||
import pytest
|
||||
|
||||
|
||||
def test_single_space():
|
||||
tokens = EN.tokenize('hello possums')
|
||||
assert len(tokens) == 2
|
||||
|
||||
|
||||
def test_double_space():
|
||||
tokens = EN.tokenize('hello  possums')
|
||||
assert len(tokens) == 3
|
||||
assert tokens[1].string == ' '
|
||||
|
||||
|
||||
def test_newline():
|
||||
tokens = EN.tokenize('hello\npossums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_newline_space():
|
||||
tokens = EN.tokenize('hello \npossums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_newline_double_space():
|
||||
tokens = EN.tokenize('hello  \npossums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
def test_newline_space_wrap():
|
||||
tokens = EN.tokenize('hello \n possums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user