Mirror of https://github.com/explosion/spaCy.git

commit 963570aa49
Merge branch 'master' of github.com:spacy-io/spaCy
@@ -129,7 +129,7 @@ def _read_freqs(loc, max_length=100, min_doc_freq=0, min_freq=200):
             word = key
             smooth_count = counts.smoother(int(freq))
             log_smooth_count = math.log(smooth_count)
-            probs[word] = math.log(smooth_count) - log_total
+            probs[word] = log_smooth_count - log_total
     oov_prob = math.log(counts.smoother(0)) - log_total
     return probs, oov_prob
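For context, the fix above simply reuses the log_smooth_count value computed on the previous line instead of calling math.log on smooth_count a second time. A minimal, self-contained sketch of the same pattern, with a hypothetical add-half smoother standing in for the counter's smoother() used by _read_freqs:

import math

def log_probs_from_counts(freqs, smoother=lambda c: c + 0.5):
    # `smoother` is a hypothetical stand-in for counts.smoother() in the hunk
    # above: it maps a raw count to a smoothed count (here, add-half smoothing).
    total = sum(smoother(freq) for freq in freqs.values())
    log_total = math.log(total)
    probs = {}
    for word, freq in freqs.items():
        smooth_count = smoother(freq)
        log_smooth_count = math.log(smooth_count)
        # Reuse the value computed above rather than calling math.log again.
        probs[word] = log_smooth_count - log_total
    # Probability mass reserved for unseen words: the smoothed count for zero.
    oov_prob = math.log(smoother(0)) - log_total
    return probs, oov_prob

probs, oov_prob = log_probs_from_counts({"the": 1000, "cat": 20})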
@@ -54,7 +54,7 @@ def represent_word(word):
     # Only do this if the lower-cased form is more probable.
     if text.istitle() \
     and is_sent_begin(word) \
-    and word.prob < word.vocab[text.lower()].prob:
+    and word.prob < word.doc.vocab[text.lower()].prob:
         text = text.lower()
     return text + '|' + word.tag_
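The change above reads the vocabulary through the token's Doc (word.doc.vocab) rather than word.vocab, presumably because Token did not expose a vocab attribute directly. A rough sketch of how the fixed condition sits inside the helper, assuming spaCy's Token/Vocab API; is_sent_begin is the example script's own helper, and the version inlined below is only an approximation, not the original logic:

def is_sent_begin(word):
    # Approximation: treat the token as sentence-initial if it opens the Doc
    # or follows sentence-final punctuation.
    return word.i == 0 or word.nbor(-1).text in ('.', '!', '?')

def represent_word(word):
    text = word.text
    # Only lower-case a title-cased, sentence-initial token when the
    # lower-cased form is more probable according to the vocabulary.
    if text.istitle() \
    and is_sent_begin(word) \
    and word.prob < word.doc.vocab[text.lower()].prob:
        text = text.lower()
    return text + '|' + word.tag_

# e.g. ' '.join(represent_word(w) for w in nlp(u'Here is a sentence.'))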
@@ -25,7 +25,7 @@ cdef class Tokenizer:
 
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
-    cdef unicode _split_affixes(self, unicode string, vector[LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
                              vector[LexemeC*] *suffixes)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
@@ -155,11 +155,11 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
         orig_size = tokens.length
-        span = self._split_affixes(span, &prefixes, &suffixes)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
         self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef unicode _split_affixes(self, unicode string, vector[const LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
         cdef size_t i
         cdef unicode prefix
@@ -176,7 +176,7 @@ cdef class Tokenizer:
                 # Check whether we've hit a special-case
                 if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
-                    prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                    prefixes.push_back(self.vocab.get(mem, prefix))
                     break
             suf_len = self.find_suffix(string)
             if suf_len != 0:
@@ -185,18 +185,18 @@ cdef class Tokenizer:
                 # Check whether we've hit a special-case
                 if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                     string = minus_suf
-                    suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                    suffixes.push_back(self.vocab.get(mem, suffix))
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
-                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
-                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                prefixes.push_back(self.vocab.get(mem, prefix))
+                suffixes.push_back(self.vocab.get(mem, suffix))
             elif pre_len:
                 string = minus_pre
-                prefixes.push_back(self.vocab.get(self.vocab.mem, prefix))
+                prefixes.push_back(self.vocab.get(mem, prefix))
             elif suf_len:
                 string = minus_suf
-                suffixes.push_back(self.vocab.get(self.vocab.mem, suffix))
+                suffixes.push_back(self.vocab.get(mem, suffix))
             if string and (self._specials.get(hash_string(string)) != NULL):
                 break
         return string
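The tokenizer hunks above all make one change: _split_affixes now receives the calling Doc's memory pool (tokens.mem) and forwards it to vocab.get(...) instead of always allocating from self.vocab.mem, presumably so that lexemes created while stripping prefixes and suffixes can be tied to the document's lifetime rather than the vocab's. A toy pure-Python sketch of that ownership pattern; Pool, Vocab.get and split_affixes below are hypothetical stand-ins and do not mirror spaCy's real signatures:

class Pool:
    # Toy arena: objects registered here share one lifetime and are released
    # together when the pool itself is dropped.
    def __init__(self):
        self._owned = []

    def own(self, obj):
        self._owned.append(obj)
        return obj

class Vocab:
    def __init__(self):
        self.mem = Pool()  # the vocab's own long-lived pool

    def get(self, mem, string):
        # As in the diff: allocate against whichever pool the caller passes
        # (e.g. the Doc's pool) rather than unconditionally using self.mem.
        return mem.own({"orth": string})

def split_affixes(vocab, mem, string, prefixes, suffixes):
    # Mirrors the new signature: the caller's pool is threaded through so the
    # affix lexemes share the caller's lifetime.
    while string and string[0] in '("':
        prefixes.append(vocab.get(mem, string[0]))
        string = string[1:]
    while string and string[-1] in ')".,':
        suffixes.append(vocab.get(mem, string[-1]))
        string = string[:-1]
    return string

doc_mem = Pool()  # plays the role of tokens.mem in the hunks above
vocab = Vocab()
prefixes, suffixes = [], []
core = split_affixes(vocab, doc_mem, '"(spaCy)."', prefixes, suffixes)
print(core, len(prefixes), len(suffixes))  # spaCy 2 3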