mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Prevent exceptions from setting POS but not TAG. Closes #1773
This commit is contained in:
parent
b665a32b95
commit
ee4d06fb1b
|
@ -5,7 +5,7 @@ from __future__ import unicode_literals
|
|||
# back-tracking. See Issue #957
|
||||
import regex as re
|
||||
|
||||
from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
|
||||
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE, PUNCT
|
||||
|
||||
|
||||
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
||||
|
@ -60,13 +60,13 @@ BASE_EXCEPTIONS = {}
|
|||
|
||||
|
||||
for exc_data in [
|
||||
{ORTH: " ", POS: SPACE},
|
||||
{ORTH: "\t", POS: SPACE},
|
||||
{ORTH: "\\t", POS: SPACE},
|
||||
{ORTH: "\n", POS: SPACE},
|
||||
{ORTH: "\\n", POS: SPACE},
|
||||
{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
|
||||
{ORTH: "\u00a0", POS: SPACE, LEMMA: " "},
|
||||
{ORTH: " ", POS: SPACE, TAG: "_SP"},
|
||||
{ORTH: "\t", POS: SPACE, TAG: "_SP"},
|
||||
{ORTH: "\\t", POS: SPACE, TAG: "_SP"},
|
||||
{ORTH: "\n", POS: SPACE, TAG: "_SP"},
|
||||
{ORTH: "\\n", POS: SPACE, TAG: "_SP"},
|
||||
{ORTH: "\u2014"},
|
||||
{ORTH: "\u00a0", POS: SPACE, LEMMA: " ", TAG: "_SP"},
|
||||
]:
|
||||
BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ from .lexeme cimport EMPTY_LEXEME
|
|||
from .lexeme cimport Lexeme
|
||||
from .typedefs cimport attr_t
|
||||
from .tokens.token cimport Token
|
||||
from .attrs cimport PROB, LANG, ORTH, TAG
|
||||
from .attrs cimport PROB, LANG, ORTH, TAG, POS
|
||||
from .structs cimport SerializedLexemeC
|
||||
|
||||
from .compat import copy_reg, basestring_
|
||||
|
@ -232,6 +232,10 @@ cdef class Vocab:
|
|||
token.lex = lex
|
||||
if TAG in props:
|
||||
self.morphology.assign_tag(token, props[TAG])
|
||||
elif POS in props:
|
||||
# Don't allow POS to be set without TAG -- this causes problems,
|
||||
# see #1773
|
||||
props.pop(POS)
|
||||
for attr_id, value in props.items():
|
||||
Token.set_struct_attr(token, attr_id, value)
|
||||
# NORM is the only one that overlaps between the two
|
||||
|
|
Loading…
Reference in New Issue
Block a user