Try to prevent spaces from being tagged as entities

This commit is contained in:
Matthew Honnibal 2018-12-07 00:12:12 +00:00
parent 2d0c366101
commit 1e6725e9b7

View File

@@ -10,6 +10,8 @@ from ._state cimport StateC
from .transition_system cimport Transition from .transition_system cimport Transition
from .transition_system cimport do_func_t from .transition_system cimport do_func_t
from ..gold cimport GoldParseC, GoldParse from ..gold cimport GoldParseC, GoldParse
from ..lexeme cimport Lexeme
from ..attrs cimport IS_SPACE
from ..errors import Errors from ..errors import Errors
@@ -273,6 +275,9 @@ cdef class Begin:
# Don't allow entities to extend across sentence boundaries # Don't allow entities to extend across sentence boundaries
elif st.B_(1).sent_start == 1: elif st.B_(1).sent_start == 1:
return False return False
# Don't allow entities to start on whitespace
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
else: else:
return label != 0 and not st.entity_is_open() return label != 0 and not st.entity_is_open()
@@ -366,6 +371,9 @@ cdef class Last:
cdef bint is_valid(const StateC* st, attr_t label) nogil: cdef bint is_valid(const StateC* st, attr_t label) nogil:
if st.B_(1).ent_iob == 1: if st.B_(1).ent_iob == 1:
return False return False
# Don't allow entities to end on whitespace
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
@staticmethod @staticmethod
@@ -418,6 +426,8 @@ cdef class Unit:
return False return False
elif st.B_(1).ent_iob == 1: elif st.B_(1).ent_iob == 1:
return False return False
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
return False
return label != 0 and not st.entity_is_open() return label != 0 and not st.entity_is_open()
@staticmethod @staticmethod