spaCy/spacy/tests/parser/test_split_word.py
Matthew Honnibal e5ad35787c WIP on adding split-token actions to parser
This patch starts getting the StateC object ready to split tokens. The
split function is implemented by pushing indices into the buffer that
indicate an out-of-length token.

Still todo:

* Update the oracles
* Update GoldParseC
* Interpret the parse once it's complete
* Add retokenizer.split() method
2018-03-31 20:05:27 +02:00

44 lines
1.3 KiB
Python

import pytest
from ...tokens.doc import Doc
from ...vocab import Vocab
from ...syntax.stateclass import StateClass
def get_doc(words, vocab=None):
    """Build a ``Doc`` from an iterable of word strings.

    words: iterable of token texts. It is materialised with ``list`` before
        being handed to ``Doc``, so a plain string such as 'abcd' is passed
        as the list ['a', 'b', 'c', 'd'].
    vocab: vocabulary to construct the ``Doc`` with. When None, a fresh
        ``Vocab`` is created for this call.

    Returns the newly constructed ``Doc``.
    """
    vocab = Vocab() if vocab is None else vocab
    return Doc(vocab, words=list(words))
def test_push():
    """push_stack() moves the first word of the queue onto the stack.

    The queue (aka buffer) loses the word that was pushed, so the word that
    used to be second becomes the new front of the buffer.
    """
    # Keep the doc bound to a local for the whole test, as the original did.
    tokens = get_doc('abcd')
    parser_state = StateClass(tokens)

    front_before = parser_state.get_B(0)
    assert front_before == 0

    parser_state.push_stack()

    front_after = parser_state.get_B(0)
    assert front_after == 1
def test_pop():
    """pop_stack() removes the word on top of the stack."""
    tokens = get_doc('abcd')
    parser_state = StateClass(tokens)
    assert parser_state.get_B(0) == 0

    # Push the first two words; the most recent push sits on top.
    for _ in range(2):
        parser_state.push_stack()
    assert parser_state.get_S(0) == 1
    assert parser_state.get_S(1) == 0

    # After the pop, the word that was second from the top is exposed.
    parser_state.pop_stack()
    assert parser_state.get_S(0) == 0
def test_split():
    """split_token(i, n) splits the ith buffer word into n+1 pieces.

    n is 0-indexed: split(i, 0) is a no-op and split(i, 1) creates one new
    token. Here word 1 is split with n=2, so two extra entries are expected
    in the queue directly after it.
    """
    tokens = get_doc('abcd')
    parser_state = StateClass(tokens, max_split=3)

    untouched_queue = [0, 1, 2, 3]
    assert parser_state.queue == untouched_queue

    parser_state.split_token(1, 2, fast_forward=False)

    # The new entries are written as k*4 + 1 for k = 1, 2.
    # NOTE(review): 4 is presumably the document length and 1 the index of
    # the split word, giving indices past the end of the doc -- confirm
    # against the StateC implementation.
    expected_queue = [0, 1, 1*4+1, 2*4+1, 2, 3]
    assert parser_state.queue == expected_queue