mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	## Description This PR adds the most relevant documentation of spaCy's Cython API. (Todo for when we publish this: rewrite `/api/#section-cython` and `/api/#cython` to `/api/cython#conventions`.) ### Types of change docs ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
		
			
				
	
	
		
			271 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			271 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > CYTHON > STRUCTS > TOKENC
 | |
| 
 | |
| p
 | |
|     |  Cython data container for the #[code Token] object.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     token = &doc.c[3]
 | |
|     token_ptr = &doc.c[3]
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code lex]
 | |
|         +cell #[code const LexemeC*]
 | |
|         +cell A pointer to the lexeme for the token.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code morph]
 | |
|         +cell #[code uint64_t]
 | |
|         +cell An ID allowing lookup of morphological attributes.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code pos]
 | |
|         +cell #[code univ_pos_t]
 | |
|         +cell Coarse-grained part-of-speech tag.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code spacy]
 | |
|         +cell #[code bint]
 | |
|         +cell A binary value indicating whether the token has trailing whitespace.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code tag]
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell Fine-grained part-of-speech tag.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code idx]
 | |
|         +cell #[code int]
 | |
|         +cell The character offset of the token within the parent document.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code lemma]
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell Base form of the token, with no inflectional suffixes.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code sense]
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell Space for storing a word sense ID, currently unused.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code head]
 | |
|         +cell #[code int]
 | |
|         +cell Offset of the syntactic parent relative to the token.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code dep]
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell Syntactic dependency relation.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code l_kids]
 | |
|         +cell #[code uint32_t]
 | |
|         +cell Number of left children.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code r_kids]
 | |
|         +cell #[code uint32_t]
 | |
|         +cell Number of right children.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code l_edge]
 | |
|         +cell #[code uint32_t]
 | |
|         +cell Offset of the leftmost token of this token's syntactic descendants.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code r_edge]
 | |
|         +cell #[code uint32_t]
 | |
|         +cell Offset of the rightmost token of this token's syntactic descendants.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code sent_start]
 | |
|         +cell #[code int]
 | |
|         +cell
 | |
|             |  Ternary value indicating whether the token is the first word of
 | |
|             |  a sentence. #[code 0] indicates a missing value, #[code -1]
 | |
|             |  indicates #[code False] and #[code 1] indicates #[code True]. The default value, 0,
 | |
|             |  is interpreted as no sentence break. Sentence boundary detectors will usually
 | |
|             |  set 0 for all tokens except tokens that follow a sentence boundary.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code ent_iob]
 | |
|         +cell #[code int]
 | |
|         +cell
 | |
|             |  IOB code of named entity tag. #[code 0] indicates a missing
 | |
|             |  value, #[code 1] indicates #[code I], #[code 2] indicates
 | |
|             |  #[code O] and #[code 3] indicates #[code B].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code ent_type]
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell Named entity type.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code ent_id]
 | |
|         +cell #[+abbr("uint64_t") #[code hash_t]]
 | |
|         +cell
 | |
|             |  ID of the entity the token is an instance of, if any. Currently
 | |
|             |  not used, but could potentially be used for coreference resolution.
 | |
| 
 | |
| +h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr
 | |
|     +tag staticmethod
 | |
|     +tag nogil
 | |
| 
 | |
| p Get the value of an attribute from the #[code TokenC] struct by attribute ID.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.attrs cimport IS_ALPHA
 | |
|     from spacy.tokens cimport Token
 | |
| 
 | |
|     is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code token]
 | |
|         +cell #[code const TokenC*]
 | |
|         +cell A pointer to a #[code TokenC] struct.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code feat_name]
 | |
|         +cell #[code attr_id_t]
 | |
|         +cell
 | |
|             |  The ID of the attribute to look up. The attributes are
 | |
|             |  enumerated in #[code spacy.attrs].
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell The value of the attribute.
 | |
| 
 | |
| +h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr
 | |
|     +tag staticmethod
 | |
|     +tag nogil
 | |
| 
 | |
| p Set the value of an attribute of the #[code TokenC] struct by attribute ID.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.attrs cimport TAG
 | |
|     from spacy.tokens cimport Token
 | |
| 
 | |
|     token = &doc.c[3]
 | |
|     Token.set_struct_attr(token, TAG, 0)
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code token]
 | |
|         +cell #[code TokenC*]
 | |
|         +cell A pointer to a #[code TokenC] struct.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code feat_name]
 | |
|         +cell #[code attr_id_t]
 | |
|         +cell
 | |
|             |  The ID of the attribute to set. The attributes are
 | |
|             |  enumerated in #[code spacy.attrs].
 | |
| 
 | |
|     +row
 | |
|         +cell #[code value]
 | |
|         +cell #[+abbr("uint64_t") #[code attr_t]]
 | |
|         +cell The value to set.
 | |
| 
 | |
| +h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start
 | |
|     +tag function
 | |
| 
 | |
| p Find a token in a #[code TokenC*] array by the offset of its first character.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.tokens.doc cimport Doc, token_by_start
 | |
|     from spacy.vocab cimport Vocab
 | |
| 
 | |
|     doc = Doc(Vocab(), words=[u'hello', u'world'])
 | |
|     assert token_by_start(doc.c, doc.length, 6) == 1
 | |
|     assert token_by_start(doc.c, doc.length, 4) == -1
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code tokens]
 | |
|         +cell #[code const TokenC*]
 | |
|         +cell A #[code TokenC*] array.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code length]
 | |
|         +cell #[code int]
 | |
|         +cell The number of tokens in the array.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code start_char]
 | |
|         +cell #[code int]
 | |
|         +cell The start index to search for.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell #[code int]
 | |
|         +cell The index of the token in the array or #[code -1] if not found.
 | |
| 
 | |
| +h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end
 | |
|     +tag function
 | |
| 
 | |
| p Find a token in a #[code TokenC*] array by the offset of its final character.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.tokens.doc cimport Doc, token_by_end
 | |
|     from spacy.vocab cimport Vocab
 | |
| 
 | |
|     doc = Doc(Vocab(), words=[u'hello', u'world'])
 | |
|     assert token_by_end(doc.c, doc.length, 5) == 0
 | |
|     assert token_by_end(doc.c, doc.length, 1) == -1
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code tokens]
 | |
|         +cell #[code const TokenC*]
 | |
|         +cell A #[code TokenC*] array.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code length]
 | |
|         +cell #[code int]
 | |
|         +cell The number of tokens in the array.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code end_char]
 | |
|         +cell #[code int]
 | |
|         +cell The end index to search for.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell #[code int]
 | |
|         +cell The index of the token in the array or #[code -1] if not found.
 | |
| 
 | |
| +h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads
 | |
|     +tag function
 | |
| 
 | |
| p
 | |
|     |  Set attributes that allow lookup of syntactic children on a
 | |
|     |  #[code TokenC*] array. This function must be called after making changes
 | |
|     |  to the #[code TokenC.head] attribute, in order to make the parse tree
 | |
|     |  navigation consistent.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.tokens.doc cimport Doc, set_children_from_heads
 | |
|     from spacy.vocab cimport Vocab
 | |
| 
 | |
|     doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
 | |
|     doc.c[0].head = 0
 | |
|     doc.c[1].head = 0
 | |
|     doc.c[2].head = 3
 | |
|     doc.c[3].head = 1
 | |
|     set_children_from_heads(doc.c, doc.length)
 | |
|     assert doc.c[3].l_kids == 1
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code tokens]
 | |
|         +cell #[code TokenC*]
 | |
|         +cell A #[code TokenC*] array.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code length]
 | |
|         +cell #[code int]
 | |
|         +cell The number of tokens in the array.
 |