mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
different handling of space tokens

Space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token, and in input that consists exclusively of space tokens, the last space token becomes the head of all others.
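The policy is easy to check outside the parser. Below is a minimal pure-Python sketch (illustrative only, not the parser code; the function name and the boolean is_space list are hypothetical stand-ins for Lexeme.c_check_flag(token.lex, IS_SPACE)) that computes the head index each space token should end up with:

def space_heads(is_space):
    # Head indices the policy prescribes for space tokens; real tokens
    # get None because their heads are left to the parser proper.
    n = len(is_space)
    if n and all(is_space):
        # boundary case: only space tokens, the last one heads everything
        return [n - 1] * n
    heads = [None] * n
    last_real = None
    for i, space in enumerate(is_space):
        if not space:
            last_real = i
        elif last_real is not None:
            # attach to the last preceding non-space token
            heads[i] = last_real
    # leading space tokens: attach to the first non-space token
    first_real = is_space.index(False)
    for i in range(first_real):
        heads[i] = first_real
    return heads

assert space_heads([True, True, False, False]) == [2, 2, None, None]
assert space_heads([False, True, True, False]) == [None, 0, 0, None]
assert space_heads([True, True, True]) == [2, 2, 2]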
parent d328e0b4a8
commit d99a9cbce9
@@ -8,6 +8,10 @@ from ..symbols cimport punct
 from ..attrs cimport IS_SPACE
 
 
+cdef inline bint is_space_token(const TokenC* token) nogil:
+    return Lexeme.c_check_flag(token.lex, IS_SPACE)
+
+
 cdef cppclass StateC:
     int* _stack
     int* _buffer
@@ -292,23 +296,88 @@ cdef cppclass StateC:
         this._break = src._break
 
     void fast_forward() nogil:
-        while this.buffer_length() == 0 \
-        or this.stack_depth() == 0 \
-        or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-            if this.buffer_length() == 1 and this.stack_depth() == 0:
-                this.push()
-                this.pop()
-            elif this.buffer_length() == 0 and this.stack_depth() == 1:
-                this.pop()
-            elif this.buffer_length() == 0 and this.stack_depth() >= 2:
-                if this.has_head(this.S(0)):
-                    this.pop()
-                else:
-                    this.unshift()
-            elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
-                this.push()
-            elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-                this.add_arc(this.B(0), this.S(0), 0)
-                this.pop()
-            else:
-                break
+        # previous implementation, kept commented out for reference:
+        # while this.buffer_length() == 0 \
+        # or this.stack_depth() == 0 \
+        # or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
+        #     if this.buffer_length() == 1 and this.stack_depth() == 0:
+        #         this.push()
+        #         this.pop()
+        #     elif this.buffer_length() == 0 and this.stack_depth() == 1:
+        #         this.pop()
+        #     elif this.buffer_length() == 0 and this.stack_depth() >= 2:
+        #         if this.has_head(this.S(0)):
+        #             this.pop()
+        #         else:
+        #             this.unshift()
+        #     elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
+        #         this.push()
+        #     elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
+        #         this.add_arc(this.B(0), this.S(0), 0)
+        #         this.pop()
+        #     else:
+        #         break
+
+        # space token attachment policy:
+        # - always attach space tokens to the last preceding real token
+        # - except at the beginning of a sentence: attach to the first following real token
+        # - boundary case: if a document contains space tokens and nothing else,
+        #   make the last space token the head of all others
+
+        while is_space_token(this.B_(0)) \
+        or this.buffer_length() == 0 \
+        or this.stack_depth() == 0:
+            if this.buffer_length() == 0:
+                # remove the last sentence's root from the stack
+                if this.stack_depth() == 1:
+                    this.pop()
+                # parser got stuck: reduce stack or unshift
+                elif this.stack_depth() > 1:
+                    if this.has_head(this.S(0)):
+                        this.pop()
+                    else:
+                        this.unshift()
+                # stack is empty but there is another sentence on the buffer
+                elif (this.length - this._b_i) >= 1:
+                    this.push()
+                else: # stack empty and nothing else coming
+                    break
+
+            elif is_space_token(this.B_(0)):
+                # the normal case: we're somewhere inside a sentence
+                if this.stack_depth() > 0:
+                    # assert not is_space_token(this.S_(0))
+                    # attach all upcoming space tokens to their last preceding
+                    # real token (which should be on top of the stack)
+                    while is_space_token(this.B_(0)):
+                        this.add_arc(this.S(0), this.B(0), 0)
+                        this.push()
+                        this.pop()
+                # the rare case: we're at the beginning of a document:
+                # space tokens are attached to the first real token on the buffer
+                elif this.stack_depth() == 0:
+                    # store all space tokens on the stack until a real token shows up
+                    # or the last token on the buffer is reached
+                    while is_space_token(this.B_(0)) and this.buffer_length() > 1:
+                        this.push()
+                    # empty the stack by attaching all space tokens to the
+                    # first token on the buffer
+                    # boundary case: if all tokens are space tokens, the last one
+                    # becomes the head of all others
+                    while this.stack_depth() > 0:
+                        this.add_arc(this.B(0), this.S(0), 0)
+                        this.pop()
+                    # move the first token onto the stack
+                    this.push()
+
+            elif this.stack_depth() == 0:
+                # for one-token sentences
+                if this.buffer_length() == 1:
+                    this.push()
+                    this.pop()
+                # with an empty stack and a non-empty buffer
+                # only shift is valid anyway
+                elif (this.length - this._b_i) >= 1:
+                    this.push()
+
+            else: # can this even happen?
+                break
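To follow fast_forward() above, it helps to model the four state primitives it uses. This is a hypothetical pure-Python miniature, a reading aid whose method names mirror the StateC API but not spaCy's actual memory layout:

class ToyState:
    # Stack and buffer hold token indices; add_arc records heads.
    def __init__(self, n):
        self.stack = []
        self.buffer = list(range(n))
        self.heads = [None] * n

    def S(self, i):  # i-th item from the top of the stack
        return self.stack[-1 - i]

    def B(self, i):  # i-th item at the front of the buffer
        return self.buffer[i]

    def push(self):  # move B(0) onto the stack
        self.stack.append(self.buffer.pop(0))

    def pop(self):   # discard S(0)
        self.stack.pop()

    def unshift(self):  # move S(0) back to the front of the buffer
        self.buffer.insert(0, self.stack.pop())

    def add_arc(self, head, child, label=0):
        self.heads[child] = head

# trace the document-initial branch on [space, space, real, real]:
st = ToyState(4)
st.push(); st.push()                # store the two leading space tokens
while st.stack:
    st.add_arc(st.B(0), st.S(0))    # attach each space to the first real token
    st.pop()
st.push()                           # move the first real token onto the stack
assert st.heads == [2, 2, None, None]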
@@ -17,7 +17,7 @@ from libc.string cimport memcpy
 
 from cymem.cymem cimport Pool
 from .stateclass cimport StateClass
-from ._state cimport StateC
+from ._state cimport StateC, is_space_token
 
 
 DEF NON_MONOTONIC = True
@@ -166,7 +166,7 @@ cdef class Reduce:
 cdef class LeftArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE)
+        return not st.B_(0).sent_start
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -199,7 +199,7 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.S_(0).lex, IS_SPACE)
+        return not st.B_(0).sent_start
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -233,11 +233,23 @@ cdef class Break:
             return False
         elif st.at_break():
             return False
-        elif st.B(0) == 0:
-            return False
+        # unnecessary, since the first item on the buffer is always moved onto
+        # the stack automatically by fast_forward() in initialize_state()
+        # elif st.B(0) == 0:
+        #     return False
         elif st.stack_depth() < 1:
             return False
-        elif (st.S(0) + 1) != st.B(0):
+        # It is okay to predict a sentence boundary if the top item on the stack
+        # and the first item on the buffer are adjacent tokens, or if only space
+        # tokens lie in between. Because intervening space tokens must already be
+        # attached to the last preceding non-space token, it suffices to check
+        # that the head of the space token immediately preceding the first item
+        # on the buffer is the top item on the stack.
+        elif (st.S(0) + 1) != st.B(0) \
+        and not (is_space_token(st.safe_get(st.B(0)-1)) and st.H(st.B(0)-1) == st.S(0)):
             # Must break at the token boundary
             return False
         else:
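The relaxed adjacency clause above can be paraphrased outside the parser. A small sketch under stated assumptions (the helper and its arguments are hypothetical; heads holds each token's head index, None for tokens not yet attached):

def break_adjacency_ok(s0, b0, is_space, heads):
    # Mirrors the clause above: S(0) and B(0) are adjacent, or separated
    # only by space tokens that hang off S(0).
    if s0 + 1 == b0:
        return True
    prev = b0 - 1
    # every intervening space token is attached to the last preceding
    # non-space token, so checking the one just before B(0) is enough
    return is_space[prev] and heads[prev] == s0

# 'sentence' '\n' '\n' '.' -> S(0)=0, B(0)=3, both spaces headed by token 0
assert break_adjacency_ok(0, 3, [False, True, True, False], [None, 0, 0, None])
# 'sentence' 'another' '.' -> a real token intervenes, no break allowed
assert not break_adjacency_ok(0, 2, [False, False, False], [None, None, None])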
@@ -188,9 +188,11 @@ cdef class Parser:
 
             action = self.moves.c[guess]
             if not eg.is_valid[guess]:
-                with gil:
-                    move_name = self.moves.move_name(action.move, action.label)
-                    return 1
+                # with gil:
+                #     move_name = self.moves.move_name(action.move, action.label)
+                #     print 'invalid action:', move_name
+                return 1
 
             action.do(state, action.label)
             memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
             for i in range(eg.nr_class):
@@ -275,12 +277,12 @@ cdef class StepwiseState:
 
     @property
     def heads(self):
-        return [self.stcls.H(i) for i in range(self.stcls.length)]
+        return [self.stcls.H(i) for i in range(self.stcls.c.length)]
 
     @property
     def deps(self):
         return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
-                for i in range(self.stcls.length)]
+                for i in range(self.stcls.c.length)]
 
     def predict(self):
         self.eg.reset()
@@ -17,11 +17,11 @@ cdef class StateClass:
 
     @property
     def stack(self):
-        return {self.S(i) for i in range(self._s_i)}
+        return {self.S(i) for i in range(self.c._s_i)}
 
     @property
     def queue(self):
-        return {self.B(i) for i in range(self._b_i)}
+        return {self.B(i) for i in range(self.c._b_i)}
 
     def print_state(self, words):
         words = list(words) + ['_']
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import pytest
 
 
@@ -6,3 +8,14 @@ def test_root(EN):
     tokens = EN(u"i don't have other assistance")
     for t in tokens:
         assert t.dep != 0, t.orth_
+
+
+@pytest.mark.models
+def test_one_word_sentence(EN):
+    # one-word sentence
+    doc = EN.tokenizer.tokens_from_list(['Hello'])
+    EN.tagger(doc)
+    assert len(doc) == 1
+    with EN.parser.step_through(doc) as _:
+        pass
+    assert doc[0].dep != 0
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import pytest
 
 
+
 @pytest.mark.models
 def test_single_period(EN):
     string = 'A test sentence.'
@@ -37,3 +38,85 @@ def test_single_question(EN):
     assert len(words) == 4
     assert len(list(words.sents)) == 1
     assert sum(len(sent) for sent in words.sents) == len(words)
+
+
+@pytest.mark.models
+def test_sentence_breaks_no_space(EN):
+    doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' '))
+    EN.tagger(doc)
+    with EN.parser.step_through(doc) as stepwise:
+        # stack empty, automatic Shift (This)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj')  # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S')  # shift a
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-det')  # attach a
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr')  # attach sentence
+        stepwise.transition('D')  # remove sentence
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct')  # attach .
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('B-ROOT')  # set sentence start on This
+        # automatic reduction of the stack, automatic Shift to start second sentence
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj')  # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S')  # shift another
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-attr')  # attach another
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr')  # attach one
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('D')  # remove one
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct')  # attach .
+        # buffer empty, automatic cleanup
+    assert len(list(doc.sents)) == 2
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
+
+
+@pytest.mark.models
+def test_sentence_breaks_with_space(EN):
+    doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
+    EN.tagger(doc)
+    with EN.parser.step_through(doc) as stepwise:
+        # stack empty, automatic Shift (This)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj')  # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S')  # shift a
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-det')  # attach a
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr')  # attach sentence
+        stepwise.transition('D')  # remove sentence
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct')  # attach .
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('B-ROOT')  # set sentence start on This
+        # automatic reduction of the stack, automatic Shift to start second sentence
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-nsubj')  # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('S')  # shift another
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('L-attr')  # attach another
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-attr')  # attach one
+        assert EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('D')  # remove one
+        assert not EN.parser.moves.is_valid(stepwise.stcls, 'B-ROOT')
+        stepwise.transition('R-punct')  # attach .
+        # buffer empty, automatic cleanup
+    assert len(list(doc.sents)) == 2
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 2, 2, 2, 5, 2, 5, 5, 2, 8, 8, 8, 13, 13, 16, 14, 13, 13]
@@ -4,6 +4,12 @@ import pytest
 import numpy
 from spacy.attrs import HEAD
 
+def make_doc(EN, sentstr):
+    sent = sentstr.split(' ')
+    doc = EN.tokenizer.tokens_from_list(sent)
+    EN.tagger(doc)
+    return doc
+
 
 @pytest.mark.models
 def test_space_attachment(EN):
@@ -22,3 +28,63 @@ def test_sentence_space(EN):
     doc = EN(text)
     assert len(list(doc.sents)) == 2
 
+
+@pytest.mark.models
+def test_space_attachment_leading_space(EN):
+    # leading space tokens
+    doc = make_doc(EN, '\t \n This is a sentence .')
+    assert doc[0].is_space
+    assert doc[1].is_space
+    assert doc[2].orth_ == 'This'
+    with EN.parser.step_through(doc) as stepwise:
+        pass
+    assert doc[0].head.i == 2
+    assert doc[1].head.i == 2
+    assert stepwise.stack == set([2])
+
+
+@pytest.mark.models
+def test_space_attachment_intermediate_and_trailing_space(EN):
+    # intermediate and trailing space tokens
+    doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')
+    assert doc[2].is_space
+    assert doc[4].is_space
+    assert doc[5].is_space
+    assert doc[8].is_space
+    assert doc[9].is_space
+    with EN.parser.step_through(doc) as stepwise:
+        stepwise.transition('L-nsubj')
+        stepwise.transition('S')
+        stepwise.transition('L-det')
+        stepwise.transition('R-attr')
+        stepwise.transition('D')
+        stepwise.transition('R-punct')
+    assert stepwise.stack == set([])
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
+
+
+@pytest.mark.models
+def test_space_attachment_one_space_sentence(EN):
+    # a sentence consisting of a single space token
+    doc = make_doc(EN, '\n')
+    assert len(doc) == 1
+    with EN.parser.step_through(doc) as _:
+        pass
+    assert doc[0].is_space
+    assert doc[0].head.i == 0
+
+
+@pytest.mark.models
+def test_space_attachment_only_space_sentence(EN):
+    # a sentence consisting exclusively of space tokens
+    doc = make_doc(EN, '\n \t \n\n \t')
+    assert len(doc) == 4
+    for tok in doc:
+        assert tok.is_space
+    with EN.parser.step_through(doc) as _:
+        pass
+    # all tokens are attached to the last one
+    for tok in doc:
+        assert tok.head.i == 3