mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-13 08:34:57 +03:00
Add state tests, esp. for split function
This commit is contained in:
parent
e826b85cf0
commit
e0375132bd
68
spacy/tests/parser/test_split_word.py
Normal file
68
spacy/tests/parser/test_split_word.py
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ...tokens.doc import Doc
|
||||||
|
from ...vocab import Vocab
|
||||||
|
from ...syntax.stateclass import StateClass
|
||||||
|
|
||||||
|
|
||||||
|
def get_doc(words, vocab=None):
    """Build a Doc from an iterable of words, creating a fresh Vocab if none given."""
    vocab = vocab if vocab is not None else Vocab()
    return Doc(vocab, words=list(words))
|
||||||
|
|
||||||
|
def test_push():
    """state.push_stack() should move the first word of the queue (aka buffer)
    onto the stack, advancing the front of the queue by one."""
    state = StateClass(get_doc('abcd'))
    # Queue initially starts at token 0.
    assert state.get_B(0) == 0
    state.push_stack()
    # After one push, token 0 has left the queue; token 1 is now at the front.
    assert state.get_B(0) == 1
|
||||||
|
|
||||||
|
def test_pop():
    """state.pop_stack() should discard the top word of the stack, exposing
    the word that was pushed before it."""
    st = StateClass(get_doc('abcd'))
    assert st.get_B(0) == 0
    # Push tokens 0 and 1; the most recent push (1) sits on top.
    for _ in range(2):
        st.push_stack()
    assert (st.get_S(0), st.get_S(1)) == (1, 0)
    st.pop_stack()
    # Token 1 is gone; token 0 is the top of the stack again.
    assert st.get_S(0) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def toy_split(pad=2):
    """Toy model of the buffer arithmetic behind splitting a token.

    Sketches how a padded token array would be reallocated to make room for
    ``n`` extra slots at position ``i``.  The original version referenced an
    undefined name ``pad`` (a guaranteed NameError) and returned nothing;
    ``pad`` is now a parameter and the computed state is returned so the
    arithmetic can be inspected.

    pad (int): padding width on each side of the sentence array.
        NOTE(review): the intended value isn't recoverable from this sketch —
        confirm against the real state implementation's padding constant.

    RETURNS (tuple): ``(sent, ptr, i, n_moved)`` after the simulated realloc.
    """
    def _realloc(data, new_size):
        # Grow the buffer to new_size, filling new trailing slots with ''.
        # Assumes new_size >= len(data); shrinking is not modelled here.
        additions = new_size - len(data)
        return data + [''] * additions

    length = 10
    sent = list(range(length))
    sent = [None] * pad + sent + [None] * pad  # pad both ends
    ptr = pad
    i = 5   # position of the token being split
    n = 2   # number of extra pieces to make room for

    # Switch from padded to unpadded coordinates for the pointer, and the
    # reverse for the split index.
    ptr -= pad
    i += pad
    sent = _realloc(sent, length + n + (pad * 2))
    # Count of trailing elements that would need shifting to open the gap.
    # NOTE(review): `- i + 1` (not `- (i + 1)`) is kept exactly as in the
    # original sketch — verify the off-by-one against the real split code.
    n_moved = (length + (pad * 2)) - i + 1
    return sent, ptr, i, n_moved
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_split():
    '''state.split_token should take the ith word of the buffer, and split it
    into n+1 pieces. n is 0-indexed, i.e. split(i, 0) is a noop, and split(i, 1)
    creates 1 new token.'''
    doc = get_doc('abcd')
    state = StateClass(doc)
    assert len(state) == len(doc)
    # Split token 1 ('b') into three pieces: 2 new tokens are created.
    state.split_token(1, 2)
    assert len(state) == len(doc) + 2
    stdoc = state.get_doc(doc.vocab)
    # 'b' now appears three times; its neighbours are untouched.
    for idx, expected in enumerate(['a', 'b', 'b', 'b', 'c', 'd']):
        assert stdoc[idx].text == expected
|
Loading…
Reference in New Issue
Block a user