mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Bug fixes to sentences method, and improved vector transport for tokens
This commit is contained in:
		
							parent
							
								
									f2a229136c
								
							
						
					
					
						commit
						d6ac60e91c
					
				| 
						 | 
					@ -63,8 +63,10 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
 | 
				
			||||||
cdef class Tokens:
 | 
					cdef class Tokens:
 | 
				
			||||||
    """Access and set annotations onto some text.
 | 
					    """Access and set annotations onto some text.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    def __init__(self, Vocab vocab, string_length=0):
 | 
					    def __init__(self, Vocab vocab, unicode string):
 | 
				
			||||||
        self.vocab = vocab
 | 
					        self.vocab = vocab
 | 
				
			||||||
 | 
					        self._string = string
 | 
				
			||||||
 | 
					        string_length = len(string)
 | 
				
			||||||
        if string_length >= 3:
 | 
					        if string_length >= 3:
 | 
				
			||||||
            size = int(string_length / 3.0)
 | 
					            size = int(string_length / 3.0)
 | 
				
			||||||
        else:
 | 
					        else:
 | 
				
			||||||
| 
						 | 
					@ -84,16 +86,18 @@ cdef class Tokens:
 | 
				
			||||||
    def sentences(self):
 | 
					    def sentences(self):
 | 
				
			||||||
        cdef int i
 | 
					        cdef int i
 | 
				
			||||||
        sentences = []
 | 
					        sentences = []
 | 
				
			||||||
        sent = Tokens(self.vocab)
 | 
					        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
 | 
				
			||||||
        cdef attr_t period = self.vocab.strings['.']
 | 
					        cdef attr_t period = self.vocab.strings['.']
 | 
				
			||||||
        cdef attr_t question = self.vocab.strings['?']
 | 
					        cdef attr_t question = self.vocab.strings['?']
 | 
				
			||||||
        cdef attr_t exclamation = self.vocab.strings['!']
 | 
					        cdef attr_t exclamation = self.vocab.strings['!']
 | 
				
			||||||
        for i in range(self.length):
 | 
					        for i in range(self.length):
 | 
				
			||||||
            idx = sent.push_back(idx, &self.data[i])
 | 
					            sent.push_back(self.data[i].idx, &self.data[i])
 | 
				
			||||||
            if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
 | 
					            if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \
 | 
				
			||||||
              self.data[i].lex.sic == question:
 | 
					              self.data[i].lex.sic == question:
 | 
				
			||||||
                sentences.append(sent)
 | 
					                sentences.append(sent)
 | 
				
			||||||
                sent = Tokens(self.vocab)
 | 
					                sent = Tokens(self.vocab, self._string[self.data[i].idx:])
 | 
				
			||||||
 | 
					        if sent.length:
 | 
				
			||||||
 | 
					            sentences.append(sent)
 | 
				
			||||||
        return sentences
 | 
					        return sentences
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __getitem__(self, i):
 | 
					    def __getitem__(self, i):
 | 
				
			||||||
| 
						 | 
					@ -119,6 +123,10 @@ cdef class Tokens:
 | 
				
			||||||
    def __len__(self):
 | 
					    def __len__(self):
 | 
				
			||||||
        return self.length
 | 
					        return self.length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def __unicode__(self):
 | 
				
			||||||
 | 
					        cdef const TokenC* last = &self.data[self.length - 1]
 | 
				
			||||||
 | 
					        return self._string[:last.idx + last.lex.length]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
 | 
					    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
 | 
				
			||||||
        if self.length == self.max_length:
 | 
					        if self.length == self.max_length:
 | 
				
			||||||
            self._realloc(self.length * 2)
 | 
					            self._realloc(self.length * 2)
 | 
				
			||||||
| 
						 | 
					@ -221,9 +229,10 @@ cdef class Token:
 | 
				
			||||||
        self.tag = t.tag
 | 
					        self.tag = t.tag
 | 
				
			||||||
        self.dep = t.dep
 | 
					        self.dep = t.dep
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
 | 
					        #self.vec = numpy.ndarray(shape=(300,), dtype=numpy.float32)
 | 
				
			||||||
        for i in range(300):
 | 
					        #for i in range(300):
 | 
				
			||||||
            self.vec[i] = t.lex.vec[i]
 | 
					        #    self.vec[i] = t.lex.vec[i]
 | 
				
			||||||
 | 
					        self.vec = numpy.asarray(<float[:300,]> t.lex.vec)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __unicode__(self):
 | 
					    def __unicode__(self):
 | 
				
			||||||
        cdef const TokenC* t = &self._seq.data[self.i]
 | 
					        cdef const TokenC* t = &self._seq.data[self.i]
 | 
				
			||||||
| 
						 | 
					@ -247,7 +256,7 @@ cdef class Token:
 | 
				
			||||||
        return False
 | 
					        return False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def is_pos(self, univ_tag_t pos):
 | 
					    def is_pos(self, univ_tag_t pos):
 | 
				
			||||||
        return False
 | 
					        return self.tag == pos
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    property head:
 | 
					    property head:
 | 
				
			||||||
        """The token predicted by the parser to be the head of the current token."""
 | 
					        """The token predicted by the parser to be the head of the current token."""
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user