mirror of https://github.com/explosion/spaCy.git
Upd docstrings

This commit is contained in:
    parent 91a5064b7f
    commit bb80937544
@@ -16,16 +16,64 @@ def get_lex_props(string):
| 
 | ||||
| 
 | ||||
| class English(object): | ||||
|     def __init__(self, data_dir=None, tag=True, parse=False): | ||||
|     """The English NLP pipeline. | ||||
| 
 | ||||
|     Provides a tokenizer, lexicon, part-of-speech tagger and parser. | ||||
| 
 | ||||
|     Keyword args: | ||||
|         data_dir (unicode): A path to a directory, from which to load the pipeline. | ||||
|             If None, looks for a directory named "data/" in the same directory as | ||||
            the present file, i.e. path.join(path.dirname(__file__), 'data').
|             If path.join(data_dir, 'pos') exists, the tagger is loaded from it. | ||||
|             If path.join(data_dir, 'deps') exists, the parser is loaded from it. | ||||
|             See Pipeline Directory Structure for details. | ||||
| 
 | ||||
|     Attributes: | ||||
|         vocab (spacy.vocab.Vocab): The lexicon. | ||||
| 
 | ||||
|         strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs. | ||||
| 
 | ||||
|         tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline. | ||||
| 
 | ||||
|         tagger (spacy.en.pos.EnPosTagger): | ||||
|             The part-of-speech tagger, which also performs lemmatization and | ||||
|             morphological analysis. | ||||
| 
 | ||||
|         parser (spacy.syntax.parser.GreedyParser): | ||||
|             A greedy shift-reduce dependency parser. | ||||
| 
 | ||||
| 
 | ||||
|     """ | ||||
|     def __init__(self, data_dir=None): | ||||
|         if data_dir is None: | ||||
|             data_dir = path.join(path.dirname(__file__), 'data') | ||||
|         self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props) | ||||
|         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir) | ||||
|         self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None | ||||
|         self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None | ||||
|         if path.exists(path.join(data_dir, 'pos')): | ||||
|             self.tagger = EnPosTagger(self.vocab.strings, data_dir) | ||||
|         else: | ||||
|             self.tagger = None | ||||
|         if path.exists(path.join(data_dir, 'deps')): | ||||
|             self.parser = GreedyParser(path.join(data_dir, 'deps')) | ||||
|         else: | ||||
|             self.parser = None | ||||
|         self.strings = self.vocab.strings | ||||
| 
 | ||||
|     def __call__(self, text, tag=True, parse=True): | ||||
|         """Apply the pipeline to some text. | ||||
|          | ||||
|         Args: | ||||
|             text (unicode): The text to be processed. | ||||
| 
 | ||||
|         Keyword args: | ||||
|             tag (bool): Whether to add part-of-speech tags to the text.  This | ||||
|                 will also set morphological analysis and lemmas. | ||||
| 
 | ||||
|             parse (bool): Whether to add dependency-heads and labels to the text. | ||||
| 
 | ||||
|         Returns: | ||||
            tokens (spacy.tokens.Tokens): The processed text, annotated as requested.
|         """ | ||||
|         tokens = self.tokenizer.tokenize(text) | ||||
|         if self.tagger and tag: | ||||
|             self.tagger(tokens) | ||||
@@ -35,7 +83,10 @@ class English(object):
| 
 | ||||
|     @property | ||||
|     def tags(self): | ||||
|         """List of part-of-speech tag names.""" | ||||
|         if self.tagger is None: | ||||
|             return [] | ||||
|         else: | ||||
|             return self.tagger.tag_names | ||||
| 
 | ||||
| 
 | ||||
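A minimal usage sketch of the pipeline described by the docstrings above; the sample text, and the assumption that the bundled data directory is installed, are illustrative only:

    from spacy.en import English

    nlp = English()                               # loads tagger/parser if data_dir has 'pos'/'deps'
    tokens = nlp(u'This is an example sentence.', tag=True, parse=True)
    for token in tokens:
        print('%s\t%s\t%s' % (token.string, token.pos, token.dep_tag))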
|  |  | |||
@@ -204,6 +204,7 @@ cdef struct _CachedMorph:
| 
 | ||||
| 
 | ||||
| cdef class EnPosTagger(Tagger): | ||||
|     """A part-of-speech tagger for English""" | ||||
|     def __init__(self, StringStore strings, data_dir): | ||||
|         model_dir = path.join(data_dir, 'pos') | ||||
|         Tagger.__init__(self, path.join(model_dir)) | ||||
@@ -224,6 +225,11 @@ cdef class EnPosTagger(Tagger):
|         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) | ||||
| 
 | ||||
|     def __call__(self, Tokens tokens): | ||||
|         """Apply the tagger, setting the POS tags onto the Tokens object. | ||||
| 
 | ||||
|         Args: | ||||
|             tokens (Tokens): The tokens to be tagged. | ||||
|         """ | ||||
|         cdef int i | ||||
|         cdef atom_t[N_CONTEXT_FIELDS] context | ||||
|         cdef TokenC* t = tokens.data | ||||
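As a rough sketch of how the tagger is wired up on its own: the construction below mirrors what English.__init__ does above and is illustrative rather than a documented entry point; `nlp` and `data_dir` are assumed from the earlier example.

    from spacy.en.pos import EnPosTagger

    tagger = EnPosTagger(nlp.vocab.strings, data_dir)   # expects 'pos' and 'wordnet' under data_dir
    tokens = nlp.tokenizer.tokenize(u'A short example.')
    tagger(tokens)                                       # sets pos, fine_pos, lemma and morphology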
|  |  | |||
@@ -13,7 +13,8 @@ from itertools import combinations
| from ..tokens cimport TokenC | ||||
| from ._state cimport State | ||||
| from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2 | ||||
| from ._state cimport get_left, get_right | ||||
| from ._state cimport has_head, get_left, get_right | ||||
| from ._state cimport count_left_kids, count_right_kids | ||||
| 
 | ||||
| 
 | ||||
| cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: | ||||
@@ -24,10 +25,12 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|         context[3] = 0 | ||||
|         context[4] = 0 | ||||
|         context[5] = 0 | ||||
|         context[6] = 0 | ||||
|     else: | ||||
|         context[0] = token.lex.sic | ||||
|         context[1] = token.pos | ||||
|         context[2] = token.lex.cluster | ||||
|         context[1] = token.lemma | ||||
|         context[2] = token.fine_pos | ||||
|         context[3] = token.lex.cluster | ||||
|         # We've read in the string little-endian, so now we can take & (2**n)-1 | ||||
|         # to get the first n bits of the cluster. | ||||
|         # e.g. s = "1110010101" | ||||
@@ -40,9 +43,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|         # What we're doing here is picking a number where all bits are 1, e.g. | ||||
|         # 15 is 1111, 63 is 111111 and doing bitwise AND, so getting all bits in | ||||
|         # the source that are set to 1. | ||||
|         context[3] = token.lex.cluster & 63 | ||||
|         context[4] = token.lex.cluster & 15 | ||||
|         context[5] = token.dep_tag | ||||
|         context[4] = token.lex.cluster & 63 | ||||
|         context[5] = token.lex.cluster & 15 | ||||
|         context[6] = token.dep_tag if has_head(token) else 0 | ||||
| 
 | ||||
| 
 | ||||
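# A plain-Python restatement of the little-endian bit-mask trick described in
# the comments above (the cluster value is made up for illustration):
#
#     cluster = int("1110010101"[::-1], 2)   # read the cluster path string little-endian
#     cluster & 63                           # & 0b111111 -> the first 6 bits of the path
#     cluster & 15                           # & 0b1111   -> the first 4 bits of the path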
| cdef int fill_context(atom_t* context, State* state) except -1: | ||||
@@ -66,12 +69,148 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|         context[dist] = state.stack[0] - state.i | ||||
|     else: | ||||
|         context[dist] = 0 | ||||
|     context[N0lv] = 0  | ||||
|     context[S0lv] = 0 | ||||
|     context[S0rv] = 0 | ||||
|     context[S1lv] = 0 | ||||
|     context[S1rv] = 0 | ||||
|     context[N0lv] = max(count_left_kids(get_n0(state)), 5) | ||||
|     context[S0lv] = max(count_left_kids(get_s0(state)), 5) | ||||
|     context[S0rv] = max(count_right_kids(get_s0(state)), 5) | ||||
|     context[S1lv] = max(count_left_kids(get_s1(state)), 5) | ||||
|     context[S1rv] = max(count_right_kids(get_s1(state)), 5) | ||||
| 
 | ||||
|     context[S0_has_head] = 0 | ||||
|     context[S1_has_head] = 0 | ||||
|     context[S2_has_head] = 0 | ||||
|     if state.stack_len >= 1: | ||||
|         context[S0_has_head] = has_head(get_s0(state)) + 1 | ||||
|         if state.stack_len >= 2: | ||||
|             context[S1_has_head] = has_head(get_s1(state)) + 1 | ||||
|             if state.stack_len >= 3: | ||||
|                 context[S2_has_head] = has_head(get_s2(state)) | ||||
| 
 | ||||
| 
 | ||||
| unigrams = ( | ||||
|     (S2W, S2p), | ||||
|     (S2c6, S2p), | ||||
|      | ||||
|     (S1W, S1p), | ||||
|     (S1c6, S1p), | ||||
| 
 | ||||
|     (S0W, S0p), | ||||
|     (S0c6, S0p), | ||||
|   | ||||
|     (N0W, N0p), | ||||
|     (N0p,), | ||||
|     (N0c,), | ||||
|     (N0c6, N0p), | ||||
|     (N0L,), | ||||
|   | ||||
|     (N1W, N1p), | ||||
|     (N1c6, N1p), | ||||
|   | ||||
|     (N2W, N2p), | ||||
|     (N2c6, N2p), | ||||
| 
 | ||||
|     (S0r2W, S0r2p), | ||||
|     (S0r2c6, S0r2p), | ||||
|     (S0r2L,), | ||||
| 
 | ||||
|     (S0rW, S0rp), | ||||
|     (S0rc6, S0rp), | ||||
|     (S0rL,), | ||||
| 
 | ||||
|     (S0l2W, S0l2p), | ||||
|     (S0l2c6, S0l2p), | ||||
|     (S0l2L,), | ||||
| 
 | ||||
|     (S0lW, S0lp), | ||||
|     (S0lc6, S0lp), | ||||
|     (S0lL,), | ||||
| 
 | ||||
|     (N0l2W, N0l2p), | ||||
|     (N0l2c6, N0l2p), | ||||
|     (N0l2L,), | ||||
| 
 | ||||
|     (N0lW, N0lp), | ||||
|     (N0lc6, N0lp), | ||||
|     (N0lL,), | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| s0_n0 = ( | ||||
|     (S0W, S0p, N0W, N0p), | ||||
|     (S0c, S0p, N0c, N0p), | ||||
|     (S0c6, S0p, N0c6, N0p), | ||||
|     (S0c4, S0p, N0c4, N0p), | ||||
|     (S0p, N0p), | ||||
|     (S0W, N0p), | ||||
|     (S0p, N0W), | ||||
|     (S0W, N0c), | ||||
|     (S0c, N0W), | ||||
|     (S0p, N0c), | ||||
|     (S0c, N0p), | ||||
|     (S0W, S0rp, N0p), | ||||
|     (S0p, S0rp, N0p), | ||||
|     (S0p, N0lp, N0W), | ||||
|     (S0p, N0lp, N0p), | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| s1_n0 = ( | ||||
|     (S1p, N0p), | ||||
|     (S1c, N0c), | ||||
|     (S1c, N0p), | ||||
|     (S1p, N0c), | ||||
|     (S1W, S1p, N0p), | ||||
|     (S1p, N0W, N0p), | ||||
|     (S1c6, S1p, N0c6, N0p), | ||||
| ) | ||||
| 
 | ||||
| 
 | ||||
| s0_n1 = ( | ||||
|     (S0p, N1p), | ||||
|     (S0c, N1c), | ||||
|     (S0c, N1p), | ||||
|     (S0p, N1c), | ||||
|     (S0W, S0p, N1p), | ||||
|     (S0p, N1W, N1p), | ||||
|     (S0c6, S0p, N1c6, N1p), | ||||
| ) | ||||
| 
 | ||||
| n0_n1 = ( | ||||
|     (N0W, N0p, N1W, N1p), | ||||
|     (N0W, N0p, N1p), | ||||
|     (N0p, N1W, N1p), | ||||
|     (N0c, N0p, N1c, N1p), | ||||
|     (N0c6, N0p, N1c6, N1p), | ||||
|     (N0c, N1c), | ||||
|     (N0p, N1c), | ||||
| ) | ||||
| 
 | ||||
| tree_shape = ( | ||||
|     (dist,), | ||||
|     (S0p, S0_has_head, S1_has_head, S2_has_head), | ||||
|     (S0p, S0lv, S0rv), | ||||
|     (N0p, N0lv), | ||||
| ) | ||||
| 
 | ||||
| trigrams = ( | ||||
|     (N0p, N1p, N2p), | ||||
|     (S0p, S0lp, S0l2p), | ||||
|     (S0p, S0rp, S0r2p), | ||||
|     (S0p, S1p, S2p), | ||||
|     (S1p, S0p, N0p), | ||||
|     (S0p, S0lp, N0p), | ||||
|     (S0p, N0p, N0lp), | ||||
|     (N0p, N0lp, N0l2p), | ||||
|      | ||||
|     (S0W, S0p, S0rL, S0r2L), | ||||
|     (S0p, S0rL, S0r2L), | ||||
| 
 | ||||
|     (S0W, S0p, S0lL, S0l2L), | ||||
|     (S0p, S0lL, S0l2L), | ||||
| 
 | ||||
|     (N0W, N0p, N0lL, N0l2L), | ||||
|     (N0p, N0lL, N0l2L), | ||||
| ) | ||||
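# Hedged illustration of how a template such as (S0p, N0p) pairs up slots from
# the context array filled above; the actual feature extraction lives in the
# learner and is not shown in this diff.
#
#     def make_feature(template, context):
#         return tuple(context[slot] for slot in template)
#
#     make_feature((S0p, N0p), context)   # -> (tag of stack top, tag of next word)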
|   | ||||
| 
 | ||||
| arc_eager = ( | ||||
|     (S0w, S0p), | ||||
@@ -86,7 +225,6 @@ arc_eager = (
|     (N2w, N2p), | ||||
|     (N2w,), | ||||
|     (N2p,), | ||||
| 
 | ||||
|     (S0w, S0p, N0w, N0p), | ||||
|     (S0w, S0p, N0w), | ||||
|     (S0w, N0w, N0p), | ||||
|  |  | |||
@@ -1,4 +1,5 @@
| # cython: profile=True | ||||
| 
 | ||||
| from preshed.maps cimport PreshMap | ||||
| from preshed.counter cimport PreshCounter | ||||
| 
 | ||||
@@ -59,13 +60,7 @@ cdef attr_t get_lex_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
| 
 | ||||
| 
 | ||||
| cdef class Tokens: | ||||
|     """A sequence of references to Lexeme objects. | ||||
| 
 | ||||
|     The Tokens class provides fast and memory-efficient access to lexical features, | ||||
|     and can efficiently export the data to a numpy array. | ||||
| 
 | ||||
|     >>> from spacy.en import EN | ||||
|     >>> tokens = EN.tokenize('An example sentence.') | ||||
|     """Access and set annotations onto some text. | ||||
|     """ | ||||
|     def __init__(self, Vocab vocab, string_length=0): | ||||
|         self.vocab = vocab | ||||
@@ -86,10 +81,20 @@ cdef class Tokens:
|         self.length = 0 | ||||
| 
 | ||||
|     def __getitem__(self, i): | ||||
|         """Retrieve a token. | ||||
|          | ||||
|         Returns: | ||||
            token (Token): The token at index i.
|         """ | ||||
|         bounds_check(i, self.length, PADDING) | ||||
|         return Token(self, i) | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         """Iterate over the tokens. | ||||
| 
 | ||||
|         Yields: | ||||
|             token (Token): | ||||
|         """ | ||||
|         for i in range(self.length): | ||||
|             yield self[i] | ||||
| 
 | ||||
@@ -148,6 +153,11 @@ cdef class Tokens:
| 
 | ||||
| @cython.freelist(64) | ||||
| cdef class Token: | ||||
|     """An individual token. | ||||
| 
 | ||||
|     Internally, the Token is a tuple (i, tokens) --- it delegates to the Tokens | ||||
|     object. | ||||
|     """ | ||||
|     def __init__(self, Tokens tokens, int i): | ||||
|         self._seq = tokens | ||||
|         self.i = i | ||||
@@ -163,21 +173,44 @@ cdef class Token:
|             return self.string + ' ' | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         """The number of unicode code-points in the original string. | ||||
| 
 | ||||
|         Returns: | ||||
|             length (int): | ||||
|         """ | ||||
|         return self._seq.data[self.i].lex.length | ||||
| 
 | ||||
|     property idx: | ||||
|         """The index into the original string at which the token starts. | ||||
| 
 | ||||
        The following should always hold:

        >>> original_string[token.idx:token.idx + len(token)] == token.string
|         """ | ||||
|         def __get__(self): | ||||
|             return self._seq.data[self.i].idx | ||||
| 
 | ||||
|     property length: | ||||
|         def __get__(self): | ||||
|             return self._seq.data[self.i].lex.length | ||||
| 
 | ||||
|     property cluster: | ||||
|         """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering | ||||
|      | ||||
|         Similar words have better-than-chance likelihood of having similar cluster | ||||
|         IDs, although the clustering is quite noisy.  Cluster IDs make good features, | ||||
|         and help to make models slightly more robust to domain variation. | ||||
| 
 | ||||
|         A common trick is to use only the first N bits of a cluster ID in a feature, | ||||
|         as the more general part of the hierarchical clustering is often more accurate | ||||
|         than the lower categories. | ||||
| 
 | ||||
|         To assist in this, I encode the cluster IDs little-endian, to allow a simple | ||||
|         bit-mask: | ||||
| 
 | ||||
|         >>> six_bits = cluster & (2**6 - 1) | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return self._seq.data[self.i].lex.cluster | ||||
| 
 | ||||
|     property string: | ||||
|         """The unicode string of the word, with no whitespace padding.""" | ||||
|         def __get__(self): | ||||
|             cdef const TokenC* t = &self._seq.data[self.i] | ||||
|             if t.lex.sic == 0: | ||||
@@ -186,6 +219,9 @@ cdef class Token:
|             return utf8string.decode('utf8') | ||||
| 
 | ||||
|     property lemma: | ||||
|         """The unicode string of the word's lemma.  If no part-of-speech tag is | ||||
|         assigned, the most common part-of-speech tag of the word is used. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             cdef const TokenC* t = &self._seq.data[self.i] | ||||
|             if t.lemma == 0: | ||||
@@ -193,15 +229,27 @@ cdef class Token:
|             cdef bytes utf8string = self._seq.vocab.strings[t.lemma] | ||||
|             return utf8string.decode('utf8') | ||||
| 
 | ||||
|     property dep: | ||||
|     property dep_tag: | ||||
|         """The ID integer of the word's dependency label.  If no parse has been | ||||
|         assigned, defaults to 0. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return self._seq.data[self.i].dep_tag | ||||
| 
 | ||||
|     property pos: | ||||
|         """The ID integer of the word's part-of-speech tag, from the 13-tag | ||||
|         Google Universal Tag Set.  Constants for this tag set are available in | ||||
|         spacy.typedefs. | ||||
|         """ | ||||
|         def __get__(self): | ||||
|             return self._seq.data[self.i].pos | ||||
| 
 | ||||
|     property fine_pos: | ||||
        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
        by the tagger model.  Fine-grained tags include morphological information
        and other distinctions, and allow a more accurate tagger to be trained.
        """
|         def __get__(self): | ||||
|             return self._seq.data[self.i].fine_pos | ||||
| 
 | ||||
@@ -210,6 +258,7 @@ cdef class Token:
|             return self._seq.data[self.i].lex.sic | ||||
| 
 | ||||
|     property head: | ||||
|         """The token predicted by the parser to be the head of the current token.""" | ||||
|         def __get__(self): | ||||
|             cdef const TokenC* t = &self._seq.data[self.i] | ||||
|             return Token(self._seq, self.i + t.head) | ||||
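A short sketch of the Token attributes documented above, reusing the `nlp` pipeline from the earlier example (the sample text is an assumption):

    text = u'Give it back.'
    tokens = nlp(text)
    token = tokens[0]
    assert text[token.idx:token.idx + len(token)] == token.string
    print('%s %d %d' % (token.lemma, token.pos, token.cluster & 63))
    print(token.head.string)   # the token predicted to be this word's head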
|  |  | |||
@@ -33,8 +33,6 @@ cpdef Lexeme init_lexeme(id_t i, unicode string, hash_t hashed,
| 
 | ||||
| cdef class Vocab: | ||||
|     '''A map container for a language's Lexeme structs. | ||||
|      | ||||
|     Also interns UTF-8 strings, and maps them to consecutive integer IDs. | ||||
|     ''' | ||||
|     def __init__(self, data_dir=None, get_lex_props=None): | ||||
|         self.mem = Pool() | ||||
@@ -53,6 +51,7 @@ cdef class Vocab:
|             self.load(path.join(data_dir, 'lexemes')) | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         """The current number of lexemes stored.""" | ||||
|         return self.lexemes.size() | ||||
| 
 | ||||
|     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL: | ||||
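A rough sketch of the Vocab and StringStore behaviour implied above, again reusing `nlp` from the first example; note that looking an ID back up returns a UTF-8 byte string, as in the lemma property earlier:

    print(len(nlp.vocab))               # number of lexemes currently stored
    word_id = nlp.strings[u'example']   # encode a string to an integer ID
    print(nlp.strings[word_id])         # decode it back (UTF-8 bytes)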
|  |  | |||