mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Fix POS and dependency label tag names. Add parse and string navigation functions.
This commit is contained in:
		
							parent
							
								
									cb6a526fcd
								
							
						
					
					
						commit
						a97bed9359
					
				| 
						 | 
					@ -252,7 +252,7 @@ cdef class EnPosTagger:
 | 
				
			||||||
                scores = self.model.score(context)
 | 
					                scores = self.model.score(context)
 | 
				
			||||||
                tokens.data[i].tag = arg_max(scores, self.model.n_classes)
 | 
					                tokens.data[i].tag = arg_max(scores, self.model.n_classes)
 | 
				
			||||||
                self.set_morph(i, tokens.data)
 | 
					                self.set_morph(i, tokens.data)
 | 
				
			||||||
        tokens.pos_scheme = self.tag_map
 | 
					        tokens._tag_strings = self.tag_names
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def train(self, Tokens tokens, object golds):
 | 
					    def train(self, Tokens tokens, object golds):
 | 
				
			||||||
        cdef int i
 | 
					        cdef int i
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,3 +1,5 @@
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ._state cimport State
 | 
					from ._state cimport State
 | 
				
			||||||
from ._state cimport has_head, get_idx, get_s0, get_n0
 | 
					from ._state cimport has_head, get_idx, get_s0, get_n0
 | 
				
			||||||
from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
 | 
					from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep
 | 
				
			||||||
| 
						 | 
					@ -106,12 +108,14 @@ cdef class TransitionSystem:
 | 
				
			||||||
        self.label_ids = {'ROOT': 0}
 | 
					        self.label_ids = {'ROOT': 0}
 | 
				
			||||||
        cdef int label_id
 | 
					        cdef int label_id
 | 
				
			||||||
        for label_str in left_labels:
 | 
					        for label_str in left_labels:
 | 
				
			||||||
 | 
					            label_str = unicode(label_str)
 | 
				
			||||||
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
 | 
					            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
 | 
				
			||||||
            moves[i].move = LEFT
 | 
					            moves[i].move = LEFT
 | 
				
			||||||
            moves[i].label = label_id
 | 
					            moves[i].label = label_id
 | 
				
			||||||
            moves[i].clas = i
 | 
					            moves[i].clas = i
 | 
				
			||||||
            i += 1
 | 
					            i += 1
 | 
				
			||||||
        for label_str in right_labels:
 | 
					        for label_str in right_labels:
 | 
				
			||||||
 | 
					            label_str = unicode(label_str)
 | 
				
			||||||
            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
 | 
					            label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
 | 
				
			||||||
            moves[i].move = RIGHT
 | 
					            moves[i].move = RIGHT
 | 
				
			||||||
            moves[i].label = label_id
 | 
					            moves[i].label = label_id
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -79,6 +79,10 @@ cdef class GreedyParser:
 | 
				
			||||||
            scores = self.model.score(context)
 | 
					            scores = self.model.score(context)
 | 
				
			||||||
            guess = self.moves.best_valid(scores, state)
 | 
					            guess = self.moves.best_valid(scores, state)
 | 
				
			||||||
            self.moves.transition(state, &guess)
 | 
					            self.moves.transition(state, &guess)
 | 
				
			||||||
 | 
					        # Messily tell Tokens object the string names of the dependency labels
 | 
				
			||||||
 | 
					        tokens._dep_strings = [None] * len(self.moves.label_ids)
 | 
				
			||||||
 | 
					        for label, id_ in self.moves.label_ids.items():
 | 
				
			||||||
 | 
					            tokens._dep_strings[id_] = label
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
 | 
					    def train_sent(self, Tokens tokens, list gold_heads, list gold_labels):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,6 +1,7 @@
 | 
				
			||||||
from libc.stdint cimport uint32_t
 | 
					from libc.stdint cimport uint32_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from numpy cimport ndarray
 | 
					from numpy cimport ndarray
 | 
				
			||||||
 | 
					cimport numpy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
from thinc.typedefs cimport atom_t
 | 
					from thinc.typedefs cimport atom_t
 | 
				
			||||||
| 
						 | 
					@ -29,11 +30,13 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
 | 
				
			||||||
cdef class Tokens:
 | 
					cdef class Tokens:
 | 
				
			||||||
    cdef Pool mem
 | 
					    cdef Pool mem
 | 
				
			||||||
    cdef Vocab vocab
 | 
					    cdef Vocab vocab
 | 
				
			||||||
    cdef list tag_names
 | 
					    
 | 
				
			||||||
    cdef dict pos_scheme
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef TokenC* data
 | 
					    cdef TokenC* data
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef unicode _string
 | 
					    cdef unicode _string
 | 
				
			||||||
 | 
					    cdef list _tag_strings
 | 
				
			||||||
 | 
					    cdef list _dep_strings
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int length
 | 
					    cdef int length
 | 
				
			||||||
    cdef int max_length
 | 
					    cdef int max_length
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,5 +1,4 @@
 | 
				
			||||||
# cython: embedsignature=True
 | 
					# cython: embedsignature=True
 | 
				
			||||||
from cython.view cimport array as cvarray
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
from preshed.maps cimport PreshMap
 | 
					from preshed.maps cimport PreshMap
 | 
				
			||||||
from preshed.counter cimport PreshCounter
 | 
					from preshed.counter cimport PreshCounter
 | 
				
			||||||
| 
						 | 
					@ -9,6 +8,7 @@ from .typedefs cimport attr_id_t, attr_t
 | 
				
			||||||
from .typedefs cimport LEMMA
 | 
					from .typedefs cimport LEMMA
 | 
				
			||||||
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 | 
					from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 | 
				
			||||||
from .typedefs cimport POS, LEMMA
 | 
					from .typedefs cimport POS, LEMMA
 | 
				
			||||||
 | 
					from .typedefs import UNIV_TAG_NAMES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from unidecode import unidecode
 | 
					from unidecode import unidecode
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -84,6 +84,8 @@ cdef class Tokens:
 | 
				
			||||||
        self.data = data_start + PADDING
 | 
					        self.data = data_start + PADDING
 | 
				
			||||||
        self.max_length = size
 | 
					        self.max_length = size
 | 
				
			||||||
        self.length = 0
 | 
					        self.length = 0
 | 
				
			||||||
 | 
					        self._tag_strings = [] # These will be set by the POS tagger and parser
 | 
				
			||||||
 | 
					        self._dep_strings = [] # The strings are arbitrary and model-specific.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def sentences(self):
 | 
					    def sentences(self):
 | 
				
			||||||
        cdef int i
 | 
					        cdef int i
 | 
				
			||||||
| 
						 | 
					@ -148,7 +150,7 @@ cdef class Tokens:
 | 
				
			||||||
        return idx + t.lex.length
 | 
					        return idx + t.lex.length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @cython.boundscheck(False)
 | 
					    @cython.boundscheck(False)
 | 
				
			||||||
    cpdef long[:,:] to_array(self, object attr_ids):
 | 
					    cpdef long[:,:] to_array(self, object py_attr_ids):
 | 
				
			||||||
        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
 | 
					        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
 | 
				
			||||||
        of shape N*M, where N is the length of the sentence.
 | 
					        of shape N*M, where N is the length of the sentence.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -162,8 +164,11 @@ cdef class Tokens:
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        cdef int i, j
 | 
					        cdef int i, j
 | 
				
			||||||
        cdef attr_id_t feature
 | 
					        cdef attr_id_t feature
 | 
				
			||||||
        cdef long[:,:] output = cvarray(shape=(self.length, len(attr_ids)),
 | 
					        cdef numpy.ndarray[long, ndim=2] output
 | 
				
			||||||
                                        itemsize=sizeof(long), format="l")
 | 
					        # Make an array from the attributes --- otherwise our inner loop is Python
 | 
				
			||||||
 | 
					        # dict iteration.
 | 
				
			||||||
 | 
					        cdef numpy.ndarray[long, ndim=1] attr_ids = numpy.asarray(py_attr_ids)
 | 
				
			||||||
 | 
					        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int)
 | 
				
			||||||
        for i in range(self.length):
 | 
					        for i in range(self.length):
 | 
				
			||||||
            for j, feature in enumerate(attr_ids):
 | 
					            for j, feature in enumerate(attr_ids):
 | 
				
			||||||
                output[i, j] = get_token_attr(&self.data[i], feature)
 | 
					                output[i, j] = get_token_attr(&self.data[i], feature)
 | 
				
			||||||
| 
						 | 
					@ -232,6 +237,7 @@ cdef class Token:
 | 
				
			||||||
        self.sentiment = t.lex.sentiment
 | 
					        self.sentiment = t.lex.sentiment
 | 
				
			||||||
        self.flags = t.lex.flags
 | 
					        self.flags = t.lex.flags
 | 
				
			||||||
        self.lemma = t.lemma
 | 
					        self.lemma = t.lemma
 | 
				
			||||||
 | 
					        self.pos = t.pos
 | 
				
			||||||
        self.tag = t.tag
 | 
					        self.tag = t.tag
 | 
				
			||||||
        self.dep = t.dep
 | 
					        self.dep = t.dep
 | 
				
			||||||
        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
 | 
					        self.repvec = numpy.asarray(<float[:300,]> t.lex.repvec)
 | 
				
			||||||
| 
						 | 
					@ -248,6 +254,24 @@ cdef class Token:
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        return self._seq.data[self.i].lex.length
 | 
					        return self._seq.data[self.i].lex.length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def nbor(self, int i=1):
 | 
				
			||||||
 | 
					        return Token(self._seq, self.i + i)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def child(self, int i=1):
 | 
				
			||||||
 | 
					        cdef const TokenC* t = &self._seq.data[self.i]
 | 
				
			||||||
 | 
					        if i == 0:
 | 
				
			||||||
 | 
					            return self
 | 
				
			||||||
 | 
					        elif i >= 1:
 | 
				
			||||||
 | 
					            if t.r_kids == 0:
 | 
				
			||||||
 | 
					                return None
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                return Token(self._seq, _nth_significant_bit(t.r_kids, i))
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            if t.l_kids == 0:
 | 
				
			||||||
 | 
					                return None
 | 
				
			||||||
 | 
					            else:
 | 
				
			||||||
 | 
					                return Token(self._seq, _nth_significant_bit(t.l_kids, i))
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
    property head:
 | 
					    property head:
 | 
				
			||||||
        """The token predicted by the parser to be the head of the current token."""
 | 
					        """The token predicted by the parser to be the head of the current token."""
 | 
				
			||||||
        def __get__(self):
 | 
					        def __get__(self):
 | 
				
			||||||
| 
						 | 
					@ -290,10 +314,26 @@ cdef class Token:
 | 
				
			||||||
            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
 | 
					            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
 | 
				
			||||||
            return py_ustr
 | 
					            return py_ustr
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    property pos_:
 | 
				
			||||||
 | 
					        def __get__(self):
 | 
				
			||||||
 | 
					            id_to_string = {id_: string for string, id_ in UNIV_TAG_NAMES.items()}
 | 
				
			||||||
 | 
					            return id_to_string[self.pos]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property tag_:
 | 
					    property tag_:
 | 
				
			||||||
        def __get__(self):
 | 
					        def __get__(self):
 | 
				
			||||||
            return self._seq.tag_names[self.tag]
 | 
					            return self._seq._tag_strings[self.tag]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property dep_:
 | 
					    property dep_:
 | 
				
			||||||
        def __get__(self):
 | 
					        def __get__(self):
 | 
				
			||||||
            return self._seq.dep_names[self.dep]
 | 
					            return self._seq._dep_strings[self.dep]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					cdef inline uint32_t _nth_significant_bit(uint32_t bits, int n) nogil:
 | 
				
			||||||
 | 
					    cdef int i
 | 
				
			||||||
 | 
					    for i in range(32):
 | 
				
			||||||
 | 
					        if bits & (1 << i):
 | 
				
			||||||
 | 
					            n -= 1
 | 
				
			||||||
 | 
					            if n < 1:
 | 
				
			||||||
 | 
					                return i
 | 
				
			||||||
 | 
					    return 0
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,3 +1,6 @@
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
UNIV_TAG_NAMES = {
 | 
					UNIV_TAG_NAMES = {
 | 
				
			||||||
    "NO_TAG": NO_TAG,
 | 
					    "NO_TAG": NO_TAG,
 | 
				
			||||||
    "ADJ": ADJ,
 | 
					    "ADJ": ADJ,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user