Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)
Docs coming together

This commit is contained in:
    parent c282e6d5fb
    commit 45a22d6b2c
@@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon
 .. toctree::
     :maxdepth: 3
 
-    guide/overview
-    guide/install
+    guide/overview.rst
+    guide/install.rst
 
     api/index.rst
 
+    modules/index.rst
+
+
+
 Source (GitHub)
 ----------------
@@ -4,4 +4,4 @@ cimport cython
 
 
 cdef class English(Language):
-    cpdef int _split_one(self, unicode word)
+    cdef int _split_one(self, unicode word)
							
								
								
									
										16
									
								
								spacy/en.pyx
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								spacy/en.pyx
									
									
									
									
									
								
							| 
						 | 
@@ -5,22 +5,21 @@ scheme in several important respects:
 
 * Whitespace is added as tokens, except for single spaces. e.g.,
 
-    >>> [w.string for w in tokenize(u'\\nHello  \\tThere')]
+    >>> [w.string for w in EN.tokenize(u'\\nHello  \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
 
 * Contractions are normalized, e.g.
 
-    >>> [w.string for w in u"isn't ain't won't he's")]
+    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
 
 * Hyphenated words are split, with the hyphen preserved, e.g.:
 
-    >>> [w.string for w in tokenize(u'New York-based')]
+    >>> [w.string for w in EN.tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']
 
 Other improvements:
 
-* Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection
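A small usage sketch of the scheme documented above, assuming spacy.en exposes the module-level EN instance that the updated doctests reference: because whitespace is emitted as tokens, callers that only want words can filter it back out.

    from spacy.en import EN

    tokens = EN.tokenize(u'\nHello  \tThere')
    words = [w.string for w in tokens if not w.string.isspace()]
    # -> [u'Hello', u'There']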
@@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 
 
 cdef class English(Language):
+    """English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): The two letter code used by Wikipedia for the language.
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
+
     def __cinit__(self, name):
         flag_funcs = [0 for _ in range(NR_FLAGS)]
         
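A hedged sketch of how the documented lexicon attribute is meant to be used, again assuming the module-level EN instance. Of the two documented attributes, only lexicon is declared cpdef readonly in the declarations hunk further down, so it is the one reachable from Python code.

    from spacy.en import EN

    lex = EN.lexicon.lookup(u'the')   # the lexicon "exposes the lookup method"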
@@ -110,7 +116,7 @@ cdef class English(Language):
         
         Language.__init__(self, name, flag_funcs)
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):
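For illustration, a plain-Python sketch of the contract _split_one follows; this is an assumption drawn from the signatures shown here, not the repository's Cython body. Given a contiguous span of non-space characters, it returns how many leading characters make up the next token.

    def split_one(word):
        # Hypothetical pure-Python stand-in; the real method is a cdef Cython method.
        if word.startswith(u"'s") or word.startswith(u"'S"):
            return 2                 # assumed: split off the possessive clitic first
        return len(word)             # fallback inherited from Language._split_one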
@@ -4,21 +4,22 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
-    cdef list string_features
-    cdef list flag_features
-
-    cdef dict _dict
-
     cpdef Lexeme lookup(self, unicode string)
+    
+    cdef dict _dict
+    
+    cdef list _string_features
+    cdef list _flag_features
 
 
 cdef class Language:
-    cdef object name
+    cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon
 
     cpdef list tokenize(self, unicode text)
+    cpdef Lexeme lookup(self, unicode text)
 
     cdef list _tokenize(self, unicode string)
-    cpdef list _split(self, unicode string)
-    cpdef int _split_one(self, unicode word)
+    cdef list _split(self, unicode string)
+    cdef int _split_one(self, unicode word)
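The cpdef-to-cdef changes above are an API decision, not just a rename. A minimal Cython sketch (illustrative, not from the repo) of the difference:

    cdef class Example:
        cpdef int visible(self):     # gets a Python wrapper; callable as obj.visible()
            return self._hidden()

        cdef int _hidden(self):      # C-level only; not reachable from Python code
            return 1

So tokenize and lookup stay part of the public Python API, while _tokenize, _split and _split_one become internal C-level helpers.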
@@ -41,7 +41,7 @@ cdef class Language:
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
-        self.load_special_tokenization(rules)
+        self._load_special_tokenization(rules)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -75,6 +75,17 @@ cdef class Language:
         assert tokens
         return tokens
 
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+    
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
+
     cdef list _tokenize(self, unicode string):
         if string in self.cache:
             return self.cache[string]
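A brief usage sketch of the new lookup delegation, assuming the module-level EN instance: because Lexicon.lookup caches the Lexeme it creates, repeated lookups of the same string return the same object.

    from spacy.en import EN

    lex = EN.lookup(u'Hello')           # forwards to EN.lexicon.lookup(u'Hello')
    assert lex is EN.lookup(u'Hello')   # cached, so the same Lexeme comes back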
@@ -85,7 +96,7 @@ cdef class Language:
         self.cache[string] = lexemes
         return lexemes
 
-    cpdef list _split(self, unicode string):
+    cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
@@ -107,10 +118,10 @@ cdef class Language:
             string = string[split:]
         return substrings
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         return len(word)
 
-    def load_special_tokenization(self, token_rules):
+    def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
 
         Loads special-case tokenization rules into the Language.cache cache,
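To make the division of labour concrete, here is a plain-Python sketch of the loop _split drives, under the assumption that _split_one reports the length of the next token, as the default return len(word) above indicates:

    def split(string, split_one):
        substrings = []
        while string:
            n = split_one(string)         # length of the next token (>= 1 by default)
            substrings.append(string[:n])
            string = string[n:]           # mirrors `string = string[split:]` above
        return substrings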
@@ -132,14 +143,14 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
-        self.flag_features = flag_features
-        self.string_features = string_features
+        self._flag_features = flag_features
+        self._string_features = string_features
         self._dict = {}
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self.string_features, self.flag_features)
+                          self._string_features, self._flag_features)
             self._dict[string] = word
 
     cpdef Lexeme lookup(self, unicode string):
@@ -155,7 +166,7 @@ cdef class Lexicon:
         if string in self._dict:
             return self._dict[string]
         
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
-                                  self.flag_features)
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+                                  self._flag_features)
         self._dict[string] = word
         return word
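The lookup body above is a straightforward get-or-create cache. A plain-Python sketch of the same pattern (TinyLexicon and the tuple stand-in for Lexeme are illustrative only):

    class TinyLexicon(object):
        def __init__(self):
            self._dict = {}

        def lookup(self, string):
            if string in self._dict:
                return self._dict[string]
            word = (u'LEXEME', string)    # stand-in for Lexeme(string, 0, 0, {}, {}, ...)
            self._dict[string] = word
            return word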