mirror of https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
different handling of space tokens

Space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token, and in input that consists exclusively of space tokens, the last space token is made the head of all the others.
This commit is contained in:
parent d328e0b4a8
commit d99a9cbce9
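
For reference, the policy in the message above can be stated as a small pure-Python function. This is an illustrative sketch only, not code from the commit: the function name and the flat is_space list are invented stand-ins for the parser state. The diff below implements the same policy inside StateC.fast_forward(), where it has to be expressed through stack and buffer operations.

def space_token_heads(is_space):
    """Return {space token index: head index} under the policy above.
    Pure-Python reference sketch; names and the flat list are invented."""
    n = len(is_space)
    if all(is_space):
        # input is exclusively space tokens: the last one heads all others
        return {i: n - 1 for i in range(n - 1)}
    heads = {}
    for i, space in enumerate(is_space):
        if not space:
            continue
        # default: attach to the last preceding non-space token ...
        prev = next((j for j in range(i - 1, -1, -1) if not is_space[j]), None)
        if prev is not None:
            heads[i] = prev
        else:
            # ... leading spaces attach to the first following non-space token
            heads[i] = next(j for j in range(i + 1, n) if not is_space[j])
    return heads

# '\t' '\n' 'This' 'is' 'fine' '\n': leading spaces -> 'This', trailer -> 'fine'
print(space_token_heads([True, True, False, False, False, True]))
# -> {0: 2, 1: 2, 5: 4}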
@@ -8,6 +8,10 @@ from ..symbols cimport punct
 from ..attrs cimport IS_SPACE


+cdef inline bint is_space_token(const TokenC* token) nogil:
+    return Lexeme.c_check_flag(token.lex, IS_SPACE)
+
+
 cdef cppclass StateC:
     int* _stack
     int* _buffer
@@ -292,23 +296,88 @@ cdef cppclass StateC:
         this._break = src._break

     void fast_forward() nogil:
-        while this.buffer_length() == 0 \
-        or this.stack_depth() == 0 \
-        or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-            if this.buffer_length() == 1 and this.stack_depth() == 0:
-                this.push()
-                this.pop()
-            elif this.buffer_length() == 0 and this.stack_depth() == 1:
-                this.pop()
-            elif this.buffer_length() == 0 and this.stack_depth() >= 2:
-                if this.has_head(this.S(0)):
-                    this.pop()
-                else:
-                    this.unshift()
-            elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
-                this.push()
-            elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-                this.add_arc(this.B(0), this.S(0), 0)
-                this.pop()
-            else:
-                break
+        # while this.buffer_length() == 0 \
+        # or this.stack_depth() == 0 \
+        # or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
+        #     if this.buffer_length() == 1 and this.stack_depth() == 0:
+        #         this.push()
+        #         this.pop()
+        #     elif this.buffer_length() == 0 and this.stack_depth() == 1:
+        #         this.pop()
+        #     elif this.buffer_length() == 0 and this.stack_depth() >= 2:
+        #         if this.has_head(this.S(0)):
+        #             this.pop()
+        #         else:
+        #             this.unshift()
+        #     elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
+        #         this.push()
+        #     elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
+        #         this.add_arc(this.B(0), this.S(0), 0)
+        #         this.pop()
+        #     else:
+        #         break
+
+        # space token attachment policy:
+        # - attach space tokens always to the last preceding real token
+        # - except if it's the beginning of a sentence, then attach to the first following
+        # - boundary case: a document containing multiple space tokens but nothing else,
+        #   then make the last space token the head of all others
+
+        while is_space_token(this.B_(0)) \
+        or this.buffer_length() == 0 \
+        or this.stack_depth() == 0:
+            if this.buffer_length() == 0:
+                # remove the last sentence's root from the stack
+                if this.stack_depth() == 1:
+                    this.pop()
+                # parser got stuck: reduce stack or unshift
+                elif this.stack_depth() > 1:
+                    if this.has_head(this.S(0)):
+                        this.pop()
+                    else:
+                        this.unshift()
+                # stack is empty but there is another sentence on the buffer
+                elif (this.length - this._b_i) >= 1:
+                    this.push()
+                else: # stack empty and nothing else coming
+                    break
+
+            elif is_space_token(this.B_(0)):
+                # the normal case: we're somewhere inside a sentence
+                if this.stack_depth() > 0:
+                    # assert not is_space_token(this.S_(0))
+                    # attach all coming space tokens to their last preceding
+                    # real token (which should be on the top of the stack)
+                    while is_space_token(this.B_(0)):
+                        this.add_arc(this.S(0), this.B(0), 0)
+                        this.push()
+                        this.pop()
+                # the rare case: we're at the beginning of a document:
+                # space tokens are attached to the first real token on the buffer
+                elif this.stack_depth() == 0:
+                    # store all space tokens on the stack until a real token shows up
+                    # or the last token on the buffer is reached
+                    while is_space_token(this.B_(0)) and this.buffer_length() > 1:
+                        this.push()
+                    # empty the stack by attaching all space tokens to the
+                    # first token on the buffer
+                    # boundary case: if all tokens are space tokens, the last one
+                    # becomes the head of all others
+                    while this.stack_depth() > 0:
+                        this.add_arc(this.B(0), this.S(0), 0)
+                        this.pop()
+                    # move the first token onto the stack
+                    this.push()
+
+            elif this.stack_depth() == 0:
+                # for one token sentences (?)
+                if this.buffer_length() == 1:
+                    this.push()
+                    this.pop()
+                # with an empty stack and a non-empty buffer
+                # only shift is valid anyway
+                elif (this.length - this._b_i) >= 1:
+                    this.push()
+
+            else: # can this even happen?
+                break
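
The least obvious part of the new fast_forward() is the beginning-of-document branch (elif this.stack_depth() == 0: under the space-token case). Below is a loose Python model of just that branch; ToyState and its methods are simplifications invented for illustration, not the real StateC API.

class ToyState:
    """Loose stand-in for StateC: token indices on a stack and a buffer,
    plus a child -> head arc table. Invented for illustration only."""
    def __init__(self, is_space):
        self.is_space = is_space
        self.stack = []
        self.buffer = list(range(len(is_space)))
        self.heads = {}

    def push(self):
        self.stack.append(self.buffer.pop(0))

    def pop(self):
        self.stack.pop()

    def add_arc(self, head, child):
        self.heads[child] = head

    def leading_space_fixup(self):
        # mirror of the stack_depth() == 0 branch: store space tokens on
        # the stack until a real token shows up (or one token remains) ...
        while self.is_space[self.buffer[0]] and len(self.buffer) > 1:
            self.push()
        # ... then attach them all to the first token on the buffer; if
        # every token is a space, the last one heads all the others
        first = self.buffer[0]
        while self.stack:
            self.add_arc(first, self.stack[-1])
            self.pop()
        self.push()  # move the first token onto the stack

state = ToyState([True, True, False, False])  # e.g. '\t \n This is'
state.leading_space_fixup()
print(state.heads)  # {1: 2, 0: 2}: both leading spaces attach to 'This'

state = ToyState([True, True, True])          # only space tokens
state.leading_space_fixup()
print(state.heads)  # {1: 2, 0: 2}: the last space token heads the rest

The second usage line shows the boundary case from the policy comment: when every token is a space, the last one ends up as the head of all the others, matching the space-exclusive test added at the bottom of this commit.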
@@ -17,7 +17,7 @@ from libc.string cimport memcpy
 from cymem.cymem cimport Pool
 from .stateclass cimport StateClass
-from ._state cimport StateC
+from ._state cimport StateC, is_space_token


 DEF NON_MONOTONIC = True
@@ -166,7 +166,7 @@ cdef class Reduce:
 cdef class LeftArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE)
+        return not st.B_(0).sent_start

     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -199,7 +199,7 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.S_(0).lex, IS_SPACE)
+        return not st.B_(0).sent_start

     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -233,11 +233,23 @@ cdef class Break:
             return False
         elif st.at_break():
             return False
-        elif st.B(0) == 0:
-            return False
+        # unnecessary, since the first item in the buffer is always put onto the stack
+        # automatically by fast_forward() in initialize_state()
+        # elif st.B(0) == 0:
+        #     return False
         elif st.stack_depth() < 1:
             return False
-        elif (st.S(0) + 1) != st.B(0):
+        # It is okay to predict a sentence boundary if the top item on the stack
+        # and the first item on the buffer are adjacent tokens. If this is not the
+        # case, it is still okay if there are only space tokens in between.
+        # This is checked by testing whether the head of a space token immediately
+        # preceding the first item in the buffer is the top item on the stack.
+        # Intervening space tokens must be attached to the previous non-space token.
+        # Therefore, if the head of a space token that immediately precedes the first
+        # item on the buffer is the top item on the stack, a sentence boundary can be
+        # predicted.
+        elif (st.S(0) + 1) != st.B(0) \
+        and not (is_space_token(st.safe_get(st.B(0)-1)) and st.H(st.B(0)-1) == st.S(0)):
             # Must break at the token boundary
             return False
         else:
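
Since fast_forward() now attaches and consumes space tokens before the parser ever scores a move, LeftArc and RightArc no longer need their explicit IS_SPACE guards, and Break only needs the relaxed adjacency test above. That test can be paraphrased in pure Python as follows; the flat heads/is_space lists are hypothetical stand-ins for the real state API, illustrative only.

def break_ok(s0, b0, heads, is_space):
    """Paraphrase of the relaxed test in Break.is_valid: allow a boundary
    between stack top s0 and buffer front b0 if they are adjacent, or if
    only spaces intervene -- detected by checking that the space token
    just before b0 is headed by s0 (spaces always attach to the last
    preceding non-space token, so one lookup suffices)."""
    if s0 + 1 == b0:
        return True
    return is_space[b0 - 1] and heads[b0 - 1] == s0

# hypothetical layout: sentence ends at token 2, space tokens 3 and 4 were
# attached to token 2 by fast_forward(), next sentence starts at index 5
heads = [1, 1, 1, 2, 2, 5]
is_space = [False, False, False, True, True, False]
print(break_ok(2, 5, heads, is_space))  # True: only attached spaces intervene
print(break_ok(1, 5, heads, is_space))  # False: token 2 is unaccounted for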
@@ -188,9 +188,11 @@ cdef class Parser:

             action = self.moves.c[guess]
             if not eg.is_valid[guess]:
-                with gil:
-                    move_name = self.moves.move_name(action.move, action.label)
-                return 1
+                # with gil:
+                #     move_name = self.moves.move_name(action.move, action.label)
+                #     print 'invalid action:', move_name
+                return 1
+
             action.do(state, action.label)
             memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
             for i in range(eg.nr_class):
@@ -275,12 +277,12 @@ cdef class StepwiseState:

     @property
     def heads(self):
-        return [self.stcls.H(i) for i in range(self.stcls.length)]
+        return [self.stcls.H(i) for i in range(self.stcls.c.length)]

     @property
     def deps(self):
         return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
-                for i in range(self.stcls.length)]
+                for i in range(self.stcls.c.length)]

     def predict(self):
         self.eg.reset()
@@ -17,11 +17,11 @@ cdef class StateClass:

     @property
     def stack(self):
-        return {self.S(i) for i in range(self._s_i)}
+        return {self.S(i) for i in range(self.c._s_i)}

     @property
     def queue(self):
-        return {self.B(i) for i in range(self._b_i)}
+        return {self.B(i) for i in range(self.c._b_i)}

     def print_state(self, words):
         words = list(words) + ['_']
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import pytest

@@ -6,3 +8,14 @@ def test_root(EN):
     tokens = EN(u"i don't have other assistance")
     for t in tokens:
         assert t.dep != 0, t.orth_
+
+
+@pytest.mark.models
+def test_one_word_sentence(EN):
+    # one word sentence
+    doc = EN.tokenizer.tokens_from_list(['Hello'])
+    EN.tagger(doc)
+    assert len(doc) == 1
+    with EN.parser.step_through(doc) as _:
+        pass
+    assert doc[0].dep != 0
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import pytest


+
 @pytest.mark.models
 def test_single_period(EN):
     string = 'A test sentence.'
@@ -37,3 +38,85 @@ def test_single_question(EN):
     assert len(words) == 4
     assert len(list(words.sents)) == 1
     assert sum(len(sent) for sent in words.sents) == len(words)
+
+
+@pytest.mark.models
+def test_sentence_breaks_no_space(EN):
+    doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' '))
+    EN.tagger(doc)
+    with EN.parser.step_through(doc) as stepwise:
+        # stack empty, automatic Shift (This)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S') # shift a
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-det') # attach a
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr') # attach sentence
+        stepwise.transition('D') # remove sentence
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('B-ROOT') # set sentence start on This
+        # automatic reduction of the stack, automatic Shift to start second sentence
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S') # shift another
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-attr') # attach another
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr') # attach one
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('D') # remove one
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        # buffer empty, automatic cleanup
+    assert len(list(doc.sents)) == 2
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
+
+
+@pytest.mark.models
+def test_sentence_breaks_with_space(EN):
+    doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
+    EN.tagger(doc)
+    with EN.parser.step_through(doc) as stepwise:
+        # stack empty, automatic Shift (This)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S') # shift a
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-det') # attach a
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr') # attach sentence
+        stepwise.transition('D') # remove sentence
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('B-ROOT') # set sentence start on This
+        # automatic reduction of the stack, automatic Shift to start second sentence
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S') # shift another
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-attr') # attach another
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr') # attach one
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('D') # remove one
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        # buffer empty, automatic cleanup
+    assert len(list(doc.sents)) == 2
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 2, 2, 2, 5, 2, 5, 5, 2, 8, 8, 8, 13, 13, 16, 14, 13, 13]
@@ -4,6 +4,12 @@ import pytest
 import numpy
 from spacy.attrs import HEAD

+
+def make_doc(EN, sentstr):
+    sent = sentstr.split(' ')
+    doc = EN.tokenizer.tokens_from_list(sent)
+    EN.tagger(doc)
+    return doc

 @pytest.mark.models
 def test_space_attachment(EN):
@@ -22,3 +28,63 @@ def test_sentence_space(EN):
     doc = EN(text)
     assert len(list(doc.sents)) == 2
+
+
+@pytest.mark.models
+def test_space_attachment_leading_space(EN):
+    # leading space token
+    doc = make_doc(EN, '\t \n This is a sentence .')
+    assert doc[0].is_space
+    assert doc[1].is_space
+    assert doc[2].orth_ == 'This'
+    with EN.parser.step_through(doc) as stepwise:
+        pass
+    assert doc[0].head.i == 2
+    assert doc[1].head.i == 2
+    assert stepwise.stack == set([2])
+
+
+@pytest.mark.models
+def test_space_attachment_intermediate_and_trailing_space(EN):
+    # intermediate and trailing space tokens
+    doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')
+    assert doc[2].is_space
+    assert doc[4].is_space
+    assert doc[5].is_space
+    assert doc[8].is_space
+    assert doc[9].is_space
+    with EN.parser.step_through(doc) as stepwise:
+        stepwise.transition('L-nsubj')
+        stepwise.transition('S')
+        stepwise.transition('L-det')
+        stepwise.transition('R-attr')
+        stepwise.transition('D')
+        stepwise.transition('R-punct')
+    assert stepwise.stack == set([])
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
+
+
+@pytest.mark.models
+def test_space_attachment_one_space_sentence(EN):
+    # one space token sentence
+    doc = make_doc(EN, '\n')
+    assert len(doc) == 1
+    with EN.parser.step_through(doc) as _:
+        pass
+    assert doc[0].is_space
+    assert doc[0].head.i == 0
+
+
+@pytest.mark.models
+def test_space_attachment_only_space_sentence(EN):
+    # space-exclusive sentence
+    doc = make_doc(EN, '\n \t \n\n \t')
+    assert len(doc) == 4
+    for tok in doc:
+        assert tok.is_space
+    with EN.parser.step_through(doc) as _:
+        pass
+    # all tokens are attached to the last one
+    for tok in doc:
+        assert tok.head.i == 3
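
Taken together with the tests, the intended end-to-end behaviour looks roughly like this. A sketch only: it assumes a contemporary spaCy checkout with the English model installed, and uses the same API the tests above use.

from spacy.en import English

EN = English()
doc = EN(u'This is \n a sentence \n\n .')
for tok in doc:
    if tok.is_space:
        # every space token should hang off the last preceding real token
        print(repr(tok.orth_), '->', repr(tok.head.orth_))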