mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Add SPACE part-of-speech tag, and train the tagger to assign it. Also train the tagger not to mark whitespace as an entity.
This commit is contained in:
parent
f95da0bd52
commit
89a91ad726
|
@ -11,7 +11,7 @@ from thinc.typedefs cimport atom_t, weight_t
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
|
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
|
||||||
|
|
||||||
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
|
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
|
||||||
from ..typedefs cimport id_t
|
from ..typedefs cimport id_t
|
||||||
from ..structs cimport TokenC, Morphology, LexemeC
|
from ..structs cimport TokenC, Morphology, LexemeC
|
||||||
from ..tokens cimport Doc
|
from ..tokens cimport Doc
|
||||||
|
@ -180,7 +180,8 @@ POS_TAGS = {
|
||||||
"HYPH": (PUNCT, {}),
|
"HYPH": (PUNCT, {}),
|
||||||
"XX": (X, {}),
|
"XX": (X, {}),
|
||||||
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
|
"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
|
||||||
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD})
|
"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
|
||||||
|
"SP": (SPACE, {})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -218,8 +218,12 @@ cdef class GoldParse:
|
||||||
self.orig_annot = zip(*annot_tuples)
|
self.orig_annot = zip(*annot_tuples)
|
||||||
|
|
||||||
for i, gold_i in enumerate(self.cand_to_gold):
|
for i, gold_i in enumerate(self.cand_to_gold):
|
||||||
|
if self.words[i].isspace():
|
||||||
|
self.tags[i] = 'SP'
|
||||||
|
self.heads[i] = None
|
||||||
|
self.labels[i] = None
|
||||||
|
self.ner[i] = 'O'
|
||||||
if gold_i is None:
|
if gold_i is None:
|
||||||
# TODO: What do we do for missing values again?
|
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
self.tags[i] = annot_tuples[2][gold_i]
|
self.tags[i] = annot_tuples[2][gold_i]
|
||||||
|
|
|
@ -14,4 +14,5 @@ cpdef enum univ_pos_t:
|
||||||
X
|
X
|
||||||
PUNCT
|
PUNCT
|
||||||
EOL
|
EOL
|
||||||
|
SPACE
|
||||||
N_UNIV_TAGS
|
N_UNIV_TAGS
|
||||||
|
|
|
@ -15,5 +15,6 @@ UNIV_POS_NAMES = {
|
||||||
"VERB": VERB,
|
"VERB": VERB,
|
||||||
"X": X,
|
"X": X,
|
||||||
"PUNCT": PUNCT,
|
"PUNCT": PUNCT,
|
||||||
|
"SPACE": SPACE,
|
||||||
"EOL": EOL
|
"EOL": EOL
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user