spaCy/website/api/_cython/_tokenc.jade
Ines Montani 968f6f0bda
💫 Document Cython API (#2433)
## Description

This PR adds the most relevant documentation of spaCy's Cython API.

(Todo for when we publish this: rewrite `/api/#section-cython` and `/api/#cython` to `/api/cython#conventions`.)

### Types of change
docs

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-06-11 17:47:46 +02:00

271 lines
7.7 KiB
Plaintext

//- 💫 DOCS > API > CYTHON > STRUCTS > TOKENC
p
| Cython data container for the #[code Token] object.
+aside-code("Example").
token = &doc.c[3]
token_ptr = &doc.c[3]
+table(["Name", "Type", "Description"])
+row
+cell #[code lex]
+cell #[code const LexemeC*]
+cell A pointer to the lexeme for the token.
+row
+cell #[code morph]
+cell #[code uint64_t]
+cell An ID allowing lookup of morphological attributes.
+row
+cell #[code pos]
+cell #[code univ_pos_t]
+cell Coarse-grained part-of-speech tag.
+row
+cell #[code spacy]
+cell #[code bint]
+cell A binary value indicating whether the token has trailing whitespace.
+row
+cell #[code tag]
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell Fine-grained part-of-speech tag.
+row
+cell #[code idx]
+cell #[code int]
+cell The character offset of the token within the parent document.
+row
+cell #[code lemma]
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell Base form of the token, with no inflectional suffixes.
+row
+cell #[code sense]
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell Space for storing a word sense ID, currently unused.
+row
+cell #[code head]
+cell #[code int]
+cell Offset of the syntactic parent relative to the token.
+row
+cell #[code dep]
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell Syntactic dependency relation.
+row
+cell #[code l_kids]
+cell #[code uint32_t]
+cell Number of left children.
+row
+cell #[code r_kids]
+cell #[code uint32_t]
+cell Number of right children.
+row
+cell #[code l_edge]
+cell #[code uint32_t]
+cell Offset of the leftmost token of this token's syntactic descendants.
+row
+cell #[code r_edge]
+cell #[code uint32_t]
+cell Offset of the rightmost token of this token's syntactic descendants.
+row
+cell #[code sent_start]
+cell #[code int]
+cell
| Ternary value indicating whether the token is the first word of
| a sentence. #[code 0] indicates a missing value, #[code -1]
| indicates #[code False] and #[code 1] indicates #[code True]. The default value, 0,
| is interpreted as no sentence break. Sentence boundary detectors will usually
| set 0 for all tokens except tokens that follow a sentence boundary.
+row
+cell #[code ent_iob]
+cell #[code int]
+cell
| IOB code of named entity tag. #[code 0] indicates a missing
| value, #[code 1] indicates #[code I], #[code 2] indicates
| #[code O] and #[code 3] indicates #[code B].
+row
+cell #[code ent_type]
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell Named entity type.
+row
+cell #[code ent_id]
+cell #[+abbr("uint64_t") #[code hash_t]]
+cell
| ID of the entity the token is an instance of, if any. Currently
| not used, but potentially useful for coreference resolution.
+h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr
+tag staticmethod
+tag nogil
p Get the value of an attribute from the #[code TokenC] struct by attribute ID.
+aside-code("Example").
from spacy.attrs cimport IS_ALPHA
from spacy.tokens cimport Token
is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)
+table(["Name", "Type", "Description"])
+row
+cell #[code token]
+cell #[code const TokenC*]
+cell A pointer to a #[code TokenC] struct.
+row
+cell #[code feat_name]
+cell #[code attr_id_t]
+cell
| The ID of the attribute to look up. The attributes are
| enumerated in #[code spacy.attrs].
+row("foot")
+cell returns
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell The value of the attribute.
+h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr
+tag staticmethod
+tag nogil
p Set the value of an attribute of the #[code TokenC] struct by attribute ID.
+aside-code("Example").
from spacy.attrs cimport TAG
from spacy.tokens cimport Token
token = &doc.c[3]
Token.set_struct_attr(token, TAG, 0)
+table(["Name", "Type", "Description"])
+row
+cell #[code token]
+cell #[code TokenC*]
+cell A pointer to a #[code TokenC] struct.
+row
+cell #[code feat_name]
+cell #[code attr_id_t]
+cell
| The ID of the attribute to set. The attributes are
| enumerated in #[code spacy.attrs].
+row
+cell #[code value]
+cell #[+abbr("uint64_t") #[code attr_t]]
+cell The value to set.
+h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start
+tag function
p Find a token in a #[code TokenC*] array by the offset of its first character.
+aside-code("Example").
from spacy.tokens.doc cimport Doc, token_by_start
from spacy.vocab cimport Vocab
doc = Doc(Vocab(), words=[u'hello', u'world'])
assert token_by_start(doc.c, doc.length, 6) == 1
assert token_by_start(doc.c, doc.length, 4) == -1
+table(["Name", "Type", "Description"])
+row
+cell #[code tokens]
+cell #[code const TokenC*]
+cell A #[code TokenC*] array.
+row
+cell #[code length]
+cell #[code int]
+cell The number of tokens in the array.
+row
+cell #[code start_char]
+cell #[code int]
+cell The start index to search for.
+row("foot")
+cell returns
+cell #[code int]
+cell The index of the token in the array or #[code -1] if not found.
+h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end
+tag function
p Find a token in a #[code TokenC*] array by the offset of its final character.
+aside-code("Example").
from spacy.tokens.doc cimport Doc, token_by_end
from spacy.vocab cimport Vocab
doc = Doc(Vocab(), words=[u'hello', u'world'])
assert token_by_end(doc.c, doc.length, 5) == 0
assert token_by_end(doc.c, doc.length, 1) == -1
+table(["Name", "Type", "Description"])
+row
+cell #[code tokens]
+cell #[code const TokenC*]
+cell A #[code TokenC*] array.
+row
+cell #[code length]
+cell #[code int]
+cell The number of tokens in the array.
+row
+cell #[code end_char]
+cell #[code int]
+cell The end index to search for.
+row("foot")
+cell returns
+cell #[code int]
+cell The index of the token in the array or #[code -1] if not found.
+h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads
+tag function
p
| Set attributes that allow lookup of syntactic children on a
| #[code TokenC*] array. This function must be called after making changes
| to the #[code TokenC.head] attribute, in order to make the parse tree
| navigation consistent.
+aside-code("Example").
from spacy.tokens.doc cimport Doc, set_children_from_heads
from spacy.vocab cimport Vocab
doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
doc.c[0].head = 0
doc.c[1].head = 0
doc.c[2].head = 3
doc.c[3].head = 1
set_children_from_heads(doc.c, doc.length)
assert doc.c[3].l_kids == 1
+table(["Name", "Type", "Description"])
+row
+cell #[code tokens]
+cell #[code TokenC*]
+cell A #[code TokenC*] array.
+row
+cell #[code length]
+cell #[code int]
+cell The number of tokens in the array.