Prevent exceptions from setting POS but not TAG. Closes #1773

This commit is contained in:
Matthew Honnibal 2018-12-30 13:15:23 +01:00
parent b665a32b95
commit ee4d06fb1b
2 changed files with 13 additions and 9 deletions

View File

@@ -5,7 +5,7 @@ from __future__ import unicode_literals
# back-tracking. See Issue #957
import regex as re
-from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
+from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
@@ -60,13 +60,13 @@ BASE_EXCEPTIONS = {}
for exc_data in [
-{ORTH: " ", POS: SPACE},
-{ORTH: "\t", POS: SPACE},
-{ORTH: "\\t", POS: SPACE},
-{ORTH: "\n", POS: SPACE},
-{ORTH: "\\n", POS: SPACE},
-{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
-{ORTH: "\u00a0", POS: SPACE, LEMMA: " "},
+{ORTH: " ", POS: SPACE, TAG: "_SP"},
+{ORTH: "\t", POS: SPACE, TAG: "_SP"},
+{ORTH: "\\t", POS: SPACE, TAG: "_SP"},
+{ORTH: "\n", POS: SPACE, TAG: "_SP"},
+{ORTH: "\\n", POS: SPACE, TAG: "_SP"},
+{ORTH: "\u2014"},
+{ORTH: "\u00a0", POS: SPACE, LEMMA: " ", TAG: "_SP"},
]:
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]

View File

@@ -11,7 +11,7 @@ from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .typedefs cimport attr_t
from .tokens.token cimport Token
-from .attrs cimport PROB, LANG, ORTH, TAG
+from .attrs cimport PROB, LANG, ORTH, TAG, POS
from .structs cimport SerializedLexemeC
from .compat import copy_reg, basestring_
@@ -232,6 +232,10 @@ cdef class Vocab:
token.lex = lex
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
+elif POS in props:
+# Don't allow POS to be set without TAG -- this causes problems,
+# see #1773
+props.pop(POS)
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
# NORM is the only one that overlaps between the two