* Add a SPACE part-of-speech tag and train the tagger to assign it. Also train the tagger not to label whitespace tokens as entities.

2026-03-04 03:41:29 +03:00 · 2015-07-09 13:30:41 +02:00 · 2015-07-09 13:30:41 +02:00 · 89a91ad726
commit 89a91ad726
parent f95da0bd52
4 changed files with 10 additions and 3 deletions
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -11,7 +11,7 @@ from thinc.typedefs cimport atom_t, weight_t
 from ..parts_of_speech cimport univ_pos_t
 from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON

-from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
+from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
 from ..tokens cimport Doc
@@ -180,7 +180,8 @@ POS_TAGS = {
    "HYPH": (PUNCT, {}),
    "XX": (X, {}),
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
-    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD})
+    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+    "SP": (SPACE, {})
 }


--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -218,8 +218,12 @@ cdef class GoldParse:
        self.orig_annot = zip(*annot_tuples)

        for i, gold_i in enumerate(self.cand_to_gold):
+            if self.words[i].isspace():
+                self.tags[i] = 'SP'
+                self.heads[i] = None
+                self.labels[i] = None
+                self.ner[i] = 'O'
            if gold_i is None:
-                # TODO: What do we do for missing values again?
                pass
            else:
                self.tags[i] = annot_tuples[2][gold_i]
--- a/spacy/parts_of_speech.pxd
+++ b/spacy/parts_of_speech.pxd
@@ -14,4 +14,5 @@ cpdef enum univ_pos_t:
    X
    PUNCT
    EOL
+    SPACE
    N_UNIV_TAGS
--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@ -15,5 +15,6 @@ UNIV_POS_NAMES = {
    "VERB": VERB,
    "X": X,
    "PUNCT": PUNCT,
+    "SPACE": SPACE,
    "EOL": EOL
 }