mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Fix POS and dependency label tag names. Add parse and string navigation functions.
This commit is contained in:
		
							parent
							
								
									cb6a526fcd
								
							
						
					
					
						commit
						a97bed9359
					
				| 
						 | 
				
			
			@ -252,7 +252,7 @@ cdef class EnPosTagger:
 | 
			
		|||
                scores = self.model.score(context)
 | 
			
		||||
                tokens.data[i].tag = arg_max(scores, self.model.n_classes)
 | 
			
		||||
                self.set_morph(i, tokens.data)
 | 
			
		||||
        tokens.pos_scheme = self.tag_map
 | 
			
		||||
        tokens._tag_strings = self.tag_names
 | 
			
		||||
 | 
			
		||||
    def train(self, Tokens tokens, object golds):
 | 
			
		||||
        cdef int i
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,3 +1,5 @@
 | 
			
		|||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
from ._state cimport State
 | 
			
		||||
from ._state cimport has_head, get_idx, get_s0, get_n0
 | 
			
		||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
 | 
			
		||||
| 
						 | 
				
			
			@ -106,12 +108,14 @@ cdef class TransitionSystem:
 | 
			
		|||
        self.label_ids = {'ROOT': 0}
 | 
			
		||||
        cdef int label_id
 | 
			
		||||
        for label_str in left_labels:
 | 
			
		||||
            label_str = unicode(label_str)
 | 
			
		||||
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
 | 
			
		||||
            moves[i].move = LEFT
 | 
			
		||||
            moves[i].label = label_id
 | 
			
		||||
            moves[i].clas = i
 | 
			
		||||
            i += 1
 | 
			
		||||
        for label_str in right_labels:
 | 
			
		||||
            label_str = unicode(label_str)
 | 
			
		||||
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
 | 
			
		||||
            moves[i].move = RIGHT
 | 
			
		||||
            moves[i].label = label_id
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -79,6 +79,10 @@ cdef class GreedyParser:
 | 
			
		|||
            scores = self.model.score(context)
 | 
			
		||||
            guess = self.moves.best_valid(scores, state)
 | 
			
		||||
            self.moves.transition(state, &guess)
 | 
			
		||||
        # Messily tell Tokens object the string names of the dependency labels
 | 
			
		||||
        tokens._dep_strings = [None] * len(self.moves.label_ids)
 | 
			
		||||
        for label, id_ in self.moves.label_ids.items():
 | 
			
		||||
            tokens._dep_strings[id_] = label
 | 
			
		||||
        return 0
 | 
			
		||||
 | 
			
		||||
    def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,6 +1,7 @@
 | 
			
		|||
from libc.stdint cimport uint32_t
 | 
			
		||||
 | 
			
		||||
from numpy cimport ndarray
 | 
			
		||||
cimport numpy
 | 
			
		||||
 | 
			
		||||
from cymem.cymem cimport Pool
 | 
			
		||||
from thinc.typedefs cimport atom_t
 | 
			
		||||
| 
						 | 
				
			
			@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
 | 
			
		|||
cdef class Tokens:
 | 
			
		||||
    cdef Pool mem
 | 
			
		||||
    cdef Vocab vocab
 | 
			
		||||
    cdef list tag_names
 | 
			
		||||
    cdef dict pos_scheme
 | 
			
		||||
 | 
			
		||||
    
 | 
			
		||||
    cdef TokenC* data
 | 
			
		||||
    
 | 
			
		||||
 | 
			
		||||
    cdef unicode _string
 | 
			
		||||
    cdef list _tag_strings
 | 
			
		||||
    cdef list _dep_strings
 | 
			
		||||
 | 
			
		||||
    cdef int length
 | 
			
		||||
    cdef int max_length
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,5 +1,4 @@
 | 
			
		|||
# cython: embedsignature=True
 | 
			
		||||
from cython.view cimport array as cvarray
 | 
			
		||||
 | 
			
		||||
from preshed.maps cimport PreshMap
 | 
			
		||||
from preshed.counter cimport PreshCounter
 | 
			
		||||
| 
						 | 
				
			
			@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
 | 
			
		|||
from .typedefs cimport LEMMA
 | 
			
		||||
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 | 
			
		||||
from .typedefs cimport POS, LEMMA
 | 
			
		||||
from .typedefs import UNIV_TAG_NAMES
 | 
			
		||||
 | 
			
		||||
from unidecode import unidecode
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -84,6 +84,8 @@ cdef class Tokens:
 | 
			
		|||
        self.data = data_start + PADDING
 | 
			
		||||
        self.max_length = size
 | 
			
		||||
        self.length = 0
 | 
			
		||||
        self._tag_strings = [] # These will be set by the POS tagger and parser
 | 
			
		||||
        self._dep_strings = [] # The strings are arbitrary and model-specific.
 | 
			
		||||
 | 
			
		||||
    def sentences(self):
 | 
			
		||||
        cdef int i
 | 
			
		||||
| 
						 | 
				
			
			@ -148,7 +150,7 @@ cdef class Tokens:
 | 
			
		|||
        return idx + t.lex.length
 | 
			
		||||
 | 
			
		||||
    @cython.boundscheck(False)
 | 
			
		||||
    cpdef long[:,:] to_array(self, object attr_ids):
 | 
			
		||||
    cpdef long[:,:] to_array(self, object py_attr_ids):
 | 
			
		||||
        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
 | 
			
		||||
        of shape N*M, where N is the length of the sentence.
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -162,8 +164,11 @@ cdef class Tokens:
 | 
			
		|||
        """
 | 
			
		||||
        cdef int i, j
 | 
			
		||||
        cdef attr_id_t feature
 | 
			
		||||
        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
 | 
			
		||||
                                        itemsize=sizeof(long), format="l")
 | 
			
		||||
        cdef numpy.ndarray[long, ndim=2] output
 | 
			
		||||
        # Make an array from the attributes --- otherwise our inner loop is Python
 | 
			
		||||
        # dict iteration.
 | 
			
		||||
        cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
 | 
			
		||||
        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
 | 
			
		||||
        for i in range(self.length):
 | 
			
		||||
            for j, feature in enumerate(attr_ids):
 | 
			
		||||
                output[i, j] = get_token_attr(&self.data[i], feature)
 | 
			
		||||
| 
						 | 
				
			
			@ -232,6 +237,7 @@ cdef class Token:
 | 
			
		|||
        self.sentiment = t.lex.sentiment
 | 
			
		||||
        self.flags = t.lex.flags
 | 
			
		||||
        self.lemma = t.lemma
 | 
			
		||||
        self.pos = t.pos
 | 
			
		||||
        self.tag = t.tag
 | 
			
		||||
        self.dep = t.dep
 | 
			
		||||
        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
 | 
			
		||||
| 
						 | 
				
			
			@ -248,6 +254,24 @@ cdef class Token:
 | 
			
		|||
        """
 | 
			
		||||
        return self._seq.data[self.i].lex.length
 | 
			
		||||
 | 
			
		||||
    def nbor(self, int i=1):
        """Return the Token found `i` positions away from this one in the
        same sequence. The default of 1 selects the following position.

        This method itself performs no bounds check on the resulting index.
        """
        neighbour_index = self.i + i
        return Token(self._seq, neighbour_index)
 | 
			
		||||
 | 
			
		||||
    def child(self, int i=1):
        """Return a child of this token selected by `i`, or None.

        i == 0 returns this token itself. i >= 1 selects using the i-th set
        bit of the token's `r_kids` field; i <= -1 selects using the
        (-i)-th set bit of its `l_kids` field. None is returned when the
        relevant field is zero.
        """
        cdef const TokenC* t = &self._seq.data[self.i]
        if i == 0:
            return self
        # NOTE(review): the helper's return value (a bit position) is used
        # directly as the Token index -- confirm whether it should instead be
        # an offset relative to self.i. The helper also returns 0 when fewer
        # than the requested number of bits are set, which yields Token 0.
        if i >= 1:
            if t.r_kids == 0:
                return None
            return Token(self._seq, _nth_significant_bit(t.r_kids, i))
        if t.l_kids == 0:
            return None
        # Pass the magnitude: the helper treats every n < 1 like n == 1, so
        # forwarding the negative i made child(-2), child(-3), ... all return
        # the same token as child(-1).
        return Token(self._seq, _nth_significant_bit(t.l_kids, -i))
 | 
			
		||||
        
 | 
			
		||||
    property head:
 | 
			
		||||
        """The token predicted by the parser to be the head of the current token."""
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -290,10 +314,26 @@ cdef class Token:
 | 
			
		|||
            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
 | 
			
		||||
            return py_ustr
 | 
			
		||||
 | 
			
		||||
    property pos_:
        """The string name of this token's `pos` value, looked up in
        UNIV_TAG_NAMES. Raises KeyError if no name maps to that value."""
        def __get__(self):
            # UNIV_TAG_NAMES maps name -> id; invert it to look up by id.
            names_by_id = {}
            for name, tag_id in UNIV_TAG_NAMES.items():
                names_by_id[tag_id] = name
            return names_by_id[self.pos]
 | 
			
		||||
 | 
			
		||||
    property tag_:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return self._seq.tag_names[self.tag]
 | 
			
		||||
            return self._seq._tag_strings[self.tag]
 | 
			
		||||
 | 
			
		||||
    property dep_:
 | 
			
		||||
        def __get__(self):
 | 
			
		||||
            return self._seq.dep_names[self.dep]
 | 
			
		||||
            return self._seq._dep_strings[self.dep]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
    # Return the zero-based position of the n-th set bit in `bits`, counting
    # upwards from the least-significant bit.
    #
    # Any n below 1 behaves like n == 1 (the lowest set bit is returned).
    # Returns 0 when `bits` has fewer than n set bits; that result cannot be
    # told apart from "bit 0 is the answer", so callers that need to
    # distinguish the two cases must inspect `bits` themselves.
    cdef int i
    cdef uint32_t one = 1
    for i in range(32):
        # Shift an unsigned value: `1 << 31` on a signed int overflows.
        if bits & (one << i):
            n -= 1
            if n < 1:
                return i
    return 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,3 +1,6 @@
 | 
			
		|||
from __future__ import unicode_literals
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
UNIV_TAG_NAMES = {
 | 
			
		||||
    "NO_TAG": NO_TAG,
 | 
			
		||||
    "ADJ": ADJ,
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user