Prevent exceptions from setting POS but not TAG. Closes #1773

This commit is contained in:
Matthew Honnibal 2018-12-30 13:15:23 +01:00
parent b665a32b95
commit ee4d06fb1b
2 changed files with 13 additions and 9 deletions

View File

@ -5,7 +5,7 @@ from __future__ import unicode_literals
# back-tracking. See Issue #957 # back-tracking. See Issue #957
import regex as re import regex as re
from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
@ -60,13 +60,13 @@ BASE_EXCEPTIONS = {}
for exc_data in [ for exc_data in [
{ORTH: " ", POS: SPACE}, {ORTH: " ", POS: SPACE, TAG: "_SP"},
{ORTH: "\t", POS: SPACE}, {ORTH: "\t", POS: SPACE, TAG: "_SP"},
{ORTH: "\\t", POS: SPACE}, {ORTH: "\\t", POS: SPACE, TAG: "_SP"},
{ORTH: "\n", POS: SPACE}, {ORTH: "\n", POS: SPACE, TAG: "_SP"},
{ORTH: "\\n", POS: SPACE}, {ORTH: "\\n", POS: SPACE, TAG: "_SP"},
{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"}, {ORTH: "\u2014"},
{ORTH: "\u00a0", POS: SPACE, LEMMA: " "}, {ORTH: "\u00a0", POS: SPACE, LEMMA: " ", TAG: "_SP"},
]: ]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data] BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]

View File

@ -11,7 +11,7 @@ from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.token cimport Token from .tokens.token cimport Token
from .attrs cimport PROB, LANG, ORTH, TAG from .attrs cimport PROB, LANG, ORTH, TAG, POS
from .structs cimport SerializedLexemeC from .structs cimport SerializedLexemeC
from .compat import copy_reg, basestring_ from .compat import copy_reg, basestring_
@ -232,6 +232,10 @@ cdef class Vocab:
token.lex = lex token.lex = lex
if TAG in props: if TAG in props:
self.morphology.assign_tag(token, props[TAG]) self.morphology.assign_tag(token, props[TAG])
elif POS in props:
# Don't allow POS to be set without TAG -- this causes problems
# (see #1773), so drop POS from props when no TAG is given.
props.pop(POS)
for attr_id, value in props.items(): for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value) Token.set_struct_attr(token, attr_id, value)
# NORM is the only one that overlaps between the two # NORM is the only one that overlaps between the two