mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Add SPACE part-of-speech tag, and train the tagger to assign it. Also train the tagger not to mark whitespace as an entity.
This commit is contained in:
		
							parent
							
								
									f95da0bd52
								
							
						
					
					
						commit
						89a91ad726
					
				| 
						 | 
				
			
			@ -11,7 +11,7 @@ from thinc.typedefs cimport atom_t, weight_t
 | 
			
		|||
from ..parts_of_speech cimport univ_pos_t
 | 
			
		||||
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 | 
			
		||||
 | 
			
		||||
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
 | 
			
		||||
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
 | 
			
		||||
from ..typedefs cimport id_t
 | 
			
		||||
from ..structs cimport TokenC, Morphology, LexemeC
 | 
			
		||||
from ..tokens cimport Doc
 | 
			
		||||
| 
						 | 
				
			
			@ -180,7 +180,8 @@ POS_TAGS = {
 | 
			
		|||
    "HYPH": (PUNCT, {}),
 | 
			
		||||
    "XX": (X, {}),
 | 
			
		||||
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
 | 
			
		||||
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD})
 | 
			
		||||
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
 | 
			
		||||
    "SP": (SPACE, {})
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -218,8 +218,12 @@ cdef class GoldParse:
 | 
			
		|||
        self.orig_annot = zip(*annot_tuples)
 | 
			
		||||
 | 
			
		||||
        for i, gold_i in enumerate(self.cand_to_gold):
 | 
			
		||||
            if self.words[i].isspace():
 | 
			
		||||
                self.tags[i] = 'SP'
 | 
			
		||||
                self.heads[i] = None
 | 
			
		||||
                self.labels[i] = None
 | 
			
		||||
                self.ner[i] = 'O'
 | 
			
		||||
            if gold_i is None:
 | 
			
		||||
                # TODO: What do we do for missing values again?
 | 
			
		||||
                pass
 | 
			
		||||
            else:
 | 
			
		||||
                self.tags[i] = annot_tuples[2][gold_i]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,4 +14,5 @@ cpdef enum univ_pos_t:
 | 
			
		|||
    X
 | 
			
		||||
    PUNCT
 | 
			
		||||
    EOL
 | 
			
		||||
    SPACE
 | 
			
		||||
    N_UNIV_TAGS
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -15,5 +15,6 @@ UNIV_POS_NAMES = {
 | 
			
		|||
    "VERB": VERB,
 | 
			
		||||
    "X": X,
 | 
			
		||||
    "PUNCT": PUNCT,
 | 
			
		||||
    "SPACE": SPACE,
 | 
			
		||||
    "EOL": EOL
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user