Mirror of https://github.com/explosion/spaCy.git
commit 45a22d6b2c
parent c282e6d5fb

    Docs coming together

@@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon
 .. toctree::
     :maxdepth: 3
 
-    guide/overview
-    guide/install
+    guide/overview.rst
+    guide/install.rst
+
+    api/index.rst
+
+    modules/index.rst
 
 
 Source (GitHub)
 ----------------
 

@@ -4,4 +4,4 @@ cimport cython
 
 
 cdef class English(Language):
-    cpdef int _split_one(self, unicode word)
+    cdef int _split_one(self, unicode word)

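Note: a cpdef method gets both a C entry point and a Python wrapper, while a
cdef method is C-only, so this change removes _split_one from the Python API
of English. A minimal sketch of the distinction (hypothetical Splitter class,
not from this commit):

    cdef class Splitter:
        cdef int _split_one(self, unicode word):
            # C-only: callable from Cython through a typed reference,
            # invisible from Python.
            return len(word)

        cpdef list split(self, unicode string):
            # cpdef: callable from Python, still a fast C call from Cython.
            cdef int n = self._split_one(string)
            return [string[:n]]
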
spacy/en.pyx (16 changed lines)

@@ -5,22 +5,21 @@ scheme in several important respects:
 * Whitespace is added as tokens, except for single spaces. e.g.,
 
-    >>> [w.string for w in tokenize(u'\\nHello  \\tThere')]
+    >>> [w.string for w in EN.tokenize(u'\\nHello  \\tThere')]
     [u'\\n', u'Hello', u' ', u'\\t', u'There']
 
 * Contractions are normalized, e.g.
 
-    >>> [w.string for w in u"isn't ain't won't he's")]
+    >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")]
     [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"]
-   
 
 * Hyphenated words are split, with the hyphen preserved, e.g.:
 
-    >>> [w.string for w in tokenize(u'New York-based')]
+    >>> [w.string for w in EN.tokenize(u'New York-based')]
     [u'New', u'York', u'-', u'based']
 
 Other improvements:
 
 * Full unicode support
 * Email addresses, URLs, European-formatted dates and other numeric entities not
   found in the PTB are tokenized correctly
 * Heuristic handling of word-final periods (PTB expects sentence boundary detection

@@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1
 
 
 cdef class English(Language):
+    """English tokenizer, tightly coupled to lexicon.
+
+    Attributes:
+        name (unicode): The two letter code used by Wikipedia for the language.
+        lexicon (Lexicon): The lexicon. Exposes the lookup method.
+    """
+
     def __cinit__(self, name):
         flag_funcs = [0 for _ in range(NR_FLAGS)]
         

@@ -110,7 +116,7 @@ cdef class English(Language):
         
         Language.__init__(self, name, flag_funcs)
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
         if word.startswith("'s") or word.startswith("'S"):

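The doctests above now go through the module-level EN instance rather than a
bare tokenize function. A quick interactive check, assuming EN is importable
from spacy.en as the examples imply:

    >>> from spacy.en import EN
    >>> [w.string for w in EN.tokenize(u'New York-based')]
    [u'New', u'York', u'-', u'based']
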
@@ -4,21 +4,22 @@ from spacy.word cimport Lexeme
 
 
 cdef class Lexicon:
-    cdef list string_features
-    cdef list flag_features
-
-    cdef dict _dict
-
     cpdef Lexeme lookup(self, unicode string)
+
+    cdef dict _dict
+
+    cdef list _string_features
+    cdef list _flag_features
 
 
 cdef class Language:
-    cdef object name
+    cdef unicode name
     cdef dict cache
     cpdef readonly Lexicon lexicon
 
     cpdef list tokenize(self, unicode text)
+    cpdef Lexeme lookup(self, unicode text)
 
     cdef list _tokenize(self, unicode string)
-    cpdef list _split(self, unicode string)
-    cpdef int _split_one(self, unicode word)
+    cdef list _split(self, unicode string)
+    cdef int _split_one(self, unicode word)

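With these declarations, only lookup and tokenize stay reachable from Python;
the underscore-prefixed cdef attributes and the cdef methods are C-level only.
A hypothetical session showing the boundary (assumes a constructed Lexicon
instance named lex):

    >>> lexeme = lex.lookup(u'hello')   # cpdef: has a Python wrapper
    >>> lex._string_features            # cdef attribute: no Python access
    Traceback (most recent call last):
        ...
    AttributeError: ...
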
@@ -41,7 +41,7 @@ cdef class Language:
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
                                string_features, flag_features)
-        self.load_special_tokenization(rules)
+        self._load_special_tokenization(rules)
 
     cpdef list tokenize(self, unicode string):
         """Tokenize a string.
@@ -75,6 +75,17 @@ cdef class Language:
         assert tokens
         return tokens
 
+    cpdef Lexeme lookup(self, unicode string):
+        """Retrieve (or create, if not found) a Lexeme for a string, and return it.
+
+        Args:
+            string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+        Returns:
+            lexeme (Lexeme): A reference to a lexical type.
+        """
+        return self.lexicon.lookup(string)
+
     cdef list _tokenize(self, unicode string):
         if string in self.cache:
             return self.cache[string]
@@ -85,7 +96,7 @@ cdef class Language:
         self.cache[string] = lexemes
         return lexemes
 
-    cpdef list _split(self, unicode string):
+    cdef list _split(self, unicode string):
         """Find how to split a contiguous span of non-space characters into substrings.
 
         This method calls find_split repeatedly. Most languages will want to
@@ -107,10 +118,10 @@ cdef class Language:
             string = string[split:]
         return substrings
 
-    cpdef int _split_one(self, unicode word):
+    cdef int _split_one(self, unicode word):
         return len(word)
 
-    def load_special_tokenization(self, token_rules):
+    def _load_special_tokenization(self, token_rules):
         '''Load special-case tokenization rules.
 
         Loads special-case tokenization rules into the Language.cache cache,
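
The hunk above shows only the tail of _split; the loop it implies peels one
substring at a time off the front of the span, asking _split_one how much to
take. A reconstruction of that loop (a sketch, not verbatim from the file):

    cdef list _split(self, unicode string):
        substrings = []
        while string:
            split = self._split_one(string)   # default: take the whole word
            substrings.append(string[:split])
            string = string[split:]
        return substrings
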
@@ -132,14 +143,14 @@ cdef class Language:
 cdef class Lexicon:
     def __cinit__(self, words, probs, clusters, case_stats, tag_stats,
                   string_features, flag_features):
-        self.flag_features = flag_features
-        self.string_features = string_features
+        self._flag_features = flag_features
+        self._string_features = string_features
         self._dict = {}
         cdef Lexeme word
         for string in words:
             word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0),
                           case_stats.get(string, {}), tag_stats.get(string, {}),
-                          self.string_features, self.flag_features)
+                          self._string_features, self._flag_features)
             self._dict[string] = word
 
     cpdef Lexeme lookup(self, unicode string):
@@ -155,7 +166,7 @@ cdef class Lexicon:
         if string in self._dict:
             return self._dict[string]
         
-        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features,
-                                  self.flag_features)
+        cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features,
+                                  self._flag_features)
         self._dict[string] = word
         return word
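
Because Lexicon.lookup stores every new Lexeme in _dict before returning it,
repeated lookups of the same string yield the same object, and the new
Language.lookup inherits that behaviour by delegation. For example, assuming
the module-level EN instance:

    >>> first = EN.lookup(u'Hello')
    >>> second = EN.lookup(u'Hello')    # served from Lexicon._dict
    >>> first is second
    True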