mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
* Add SPACE part-of-speech tag, and train the tagger to assign it. Also train the tagger not to label whitespace as an entity.
This commit is contained in:
parent
f95da0bd52
commit
89a91ad726
|
@ -11,7 +11,7 @@ from thinc.typedefs cimport atom_t, weight_t
|
|||
from ..parts_of_speech cimport univ_pos_t
|
||||
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
|
||||
|
||||
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
|
||||
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
|
||||
from ..typedefs cimport id_t
|
||||
from ..structs cimport TokenC, Morphology, LexemeC
|
||||
from ..tokens cimport Doc
|
||||
|
@ -180,7 +180,8 @@ POS_TAGS = {
|
|||
"HYPH": (PUNCT, {}),
|
||||
"XX": (X, {}),
|
||||
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
|
||||
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD})
|
||||
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
|
||||
"SP": (SPACE, {})
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -218,8 +218,12 @@ cdef class GoldParse:
|
|||
self.orig_annot = zip(*annot_tuples)
|
||||
|
||||
for i, gold_i in enumerate(self.cand_to_gold):
|
||||
if self.words[i].isspace():
|
||||
self.tags[i] = 'SP'
|
||||
self.heads[i] = None
|
||||
self.labels[i] = None
|
||||
self.ner[i] = 'O'
|
||||
if gold_i is None:
|
||||
# TODO: What do we do for missing values again?
|
||||
pass
|
||||
else:
|
||||
self.tags[i] = annot_tuples[2][gold_i]
|
||||
|
|
|
@ -14,4 +14,5 @@ cpdef enum univ_pos_t:
|
|||
X
|
||||
PUNCT
|
||||
EOL
|
||||
SPACE
|
||||
N_UNIV_TAGS
|
||||
|
|
|
@ -15,5 +15,6 @@ UNIV_POS_NAMES = {
|
|||
"VERB": VERB,
|
||||
"X": X,
|
||||
"PUNCT": PUNCT,
|
||||
"SPACE": SPACE,
|
||||
"EOL": EOL
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user