Add state tests, esp. for split function

2026-01-22 08:14:18 +03:00 · 2018-03-30 13:25:46 +02:00 · 2018-03-30 13:25:46 +02:00 · e0375132bd
commit e0375132bd
parent e826b85cf0
1 changed files with 68 additions and 0 deletions
--- a/spacy/tests/parser/test_split_word.py
+++ b/spacy/tests/parser/test_split_word.py
@ -0,0 +1,68 @@
+import pytest
+
+from ...tokens.doc import Doc
+from ...vocab import Vocab
+from ...syntax.stateclass import StateClass
+
+
+def get_doc(words, vocab=None):
+    '''Build a Doc from an iterable of word strings.
+
+    words: iterable of token texts. It is materialised with list(), so a
+        plain string such as 'abcd' contributes one word per character.
+    vocab: vocabulary handed to the Doc; when None, a fresh Vocab() is
+        created for this call.
+    '''
+    doc_vocab = Vocab() if vocab is None else vocab
+    word_list = list(words)
+    return Doc(doc_vocab, words=word_list)
+
+def test_push():
+    '''push_stack() should move the front word of the queue (aka buffer) onto
+    the stack, so the buffer then starts at the following word.'''
+    parser_state = StateClass(get_doc('abcd'))
+    # Nothing has been pushed yet: the buffer starts at token 0.
+    front_before = parser_state.get_B(0)
+    assert front_before == 0
+    parser_state.push_stack()
+    # Token 0 has left the buffer, so token 1 is now at the front.
+    front_after = parser_state.get_B(0)
+    assert front_after == 1
+
+def test_pop():
+    '''pop_stack() should discard the word on top of the stack, exposing the
+    word that was pushed before it.'''
+    parser_state = StateClass(get_doc('abcd'))
+    assert parser_state.get_B(0) == 0
+    # Push tokens 0 and 1; the most recent push ends up on top.
+    for _ in range(2):
+        parser_state.push_stack()
+    assert parser_state.get_S(0) == 1
+    assert parser_state.get_S(1) == 0
+    parser_state.pop_stack()
+    # With token 1 popped, token 0 is the top of the stack again.
+    assert parser_state.get_S(0) == 0
+
+
+def toy_split(pad=5):
+    '''Pure-Python sketch of the index arithmetic for splitting an item
+    inside a padded buffer.
+
+    A 10-item sentence is padded with `pad` None entries on each side, the
+    buffer is grown by n=2 extra slots, and the number of entries counted as
+    moved is computed.
+
+    pad: number of padding slots on each side of the sentence.
+        NOTE(review): the body used to read `pad` without ever defining it,
+        so every call raised NameError. The default of 5 is an assumption;
+        confirm it against the padding the real state uses.
+
+    Returns a (sent, n_moved) tuple: the grown buffer and the moved count.
+    '''
+    def _realloc(data, new_size):
+        # Return a copy of `data` grown to `new_size` entries by appending ''
+        # placeholders; a non-positive difference appends nothing.
+        additions = new_size - len(data)
+        return data + ['']*additions
+    length = 10
+    sent = list(range(length))
+    sent = [None]*pad + sent + [None]*pad # pad
+    i = 5
+    n = 2
+
+    # Shift i by the left padding so it indexes the padded buffer
+    # (presumably i starts out as a position in the unpadded sentence).
+    i += pad
+    sent = _realloc(sent, length+n+(pad*2))
+    # NOTE(review): this evaluates as ((length + pad*2) - i) + 1; confirm
+    # that subtracting (i+1) was not what was intended.
+    n_moved = (length + (pad*2)) - i+1
+    return sent, n_moved
+
+
+
+def test_split():
+    '''state.split_token(i, n) should split the ith word of the buffer into
+    n+1 pieces: split_token(i, 0) is a noop, and split_token(i, 1) adds one
+    new token. Splitting 'b' with n=2 therefore yields three 'b' tokens.'''
+    doc = get_doc('abcd')
+    state = StateClass(doc)
+    assert len(state) == len(doc)
+    state.split_token(1, 2)
+    # Two extra tokens were created by the split.
+    assert len(state) == len(doc)+2
+    stdoc = state.get_doc(doc.vocab)
+    # Each piece of the split word keeps the original text.
+    expected_texts = ['a', 'b', 'b', 'b', 'c', 'd']
+    for index, text in enumerate(expected_texts):
+        assert stdoc[index].text == text