mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
968f6f0bda
## Description This PR adds the most relevant documentation of spaCy's Cython API. (Todo for when we publish this: rewrite `/api/#section-cython` and `/api/#cython` to `/api/cython#conventions`.) ### Types of change docs ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
271 lines
7.7 KiB
Plaintext
271 lines
7.7 KiB
Plaintext
//- 💫 DOCS > API > CYTHON > STRUCTS > TOKENC
|
|
|
|
p
|
|
| Cython data container for the #[code Token] object.
|
|
|
|
+aside-code("Example").
|
|
token = &doc.c[3]
|
|
token_ptr = &doc.c[3]
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code lex]
|
|
+cell #[code const LexemeC*]
|
|
+cell A pointer to the lexeme for the token.
|
|
|
|
+row
|
|
+cell #[code morph]
|
|
+cell #[code uint64_t]
|
|
+cell An ID allowing lookup of morphological attributes.
|
|
|
|
+row
|
|
+cell #[code pos]
|
|
+cell #[code univ_pos_t]
|
|
+cell Coarse-grained part-of-speech tag.
|
|
|
|
+row
|
|
+cell #[code spacy]
|
|
+cell #[code bint]
|
|
+cell A binary value indicating whether the token has trailing whitespace.
|
|
|
|
+row
|
|
+cell #[code tag]
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell Fine-grained part-of-speech tag.
|
|
|
|
+row
|
|
+cell #[code idx]
|
|
+cell #[code int]
|
|
+cell The character offset of the token within the parent document.
|
|
|
|
+row
|
|
+cell #[code lemma]
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell Base form of the token, with no inflectional suffixes.
|
|
|
|
+row
|
|
+cell #[code sense]
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell Space for storing a word sense ID, currently unused.
|
|
|
|
+row
|
|
+cell #[code head]
|
|
+cell #[code int]
|
|
+cell Offset of the syntactic parent relative to the token.
|
|
|
|
+row
|
|
+cell #[code dep]
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell Syntactic dependency relation.
|
|
|
|
+row
|
|
+cell #[code l_kids]
|
|
+cell #[code uint32_t]
|
|
+cell Number of left children.
|
|
|
|
+row
|
|
+cell #[code r_kids]
|
|
+cell #[code uint32_t]
|
|
+cell Number of right children.
|
|
|
|
+row
|
|
+cell #[code l_edge]
|
|
+cell #[code uint32_t]
|
|
+cell Offset of the leftmost token of this token's syntactic descendants.
|
|
|
|
+row
|
|
+cell #[code r_edge]
|
|
+cell #[code uint32_t]
|
|
+cell Offset of the rightmost token of this token's syntactic descendants.
|
|
|
|
+row
|
|
+cell #[code sent_start]
|
|
+cell #[code int]
|
|
+cell
|
|
| Ternary value indicating whether the token is the first word of
|
|
| a sentence. #[code 0] indicates a missing value, #[code -1]
|
|
| indicates #[code False] and #[code 1] indicates #[code True]. The default value, 0,
|
|
| is interpreted as no sentence break. Sentence boundary detectors will usually
|
|
| set 0 for all tokens except tokens that follow a sentence boundary.
|
|
|
|
+row
|
|
+cell #[code ent_iob]
|
|
+cell #[code int]
|
|
+cell
|
|
| IOB code of named entity tag. #[code 0] indicates a missing
|
|
| value, #[code 1] indicates #[code I], #[code 2] indicates
|
|
| #[code O] and #[code 3] indicates #[code B].
|
|
|
|
+row
|
|
+cell #[code ent_type]
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell Named entity type.
|
|
|
|
+row
|
|
+cell #[code ent_id]
|
|
+cell #[+abbr("uint64_t") #[code hash_t]]
|
|
+cell
|
|
| ID of the entity the token is an instance of, if any. Currently
|
|
| not used, but could potentially be used for coreference resolution.
|
|
|
|
+h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr
|
|
+tag staticmethod
|
|
+tag nogil
|
|
|
|
p Get the value of an attribute from the #[code TokenC] struct by attribute ID.
|
|
|
|
+aside-code("Example").
|
|
from spacy.attrs cimport IS_ALPHA
|
|
from spacy.tokens cimport Token
|
|
|
|
is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code token]
|
|
+cell #[code const TokenC*]
|
|
+cell A pointer to a #[code TokenC] struct.
|
|
|
|
+row
|
|
+cell #[code feat_name]
|
|
+cell #[code attr_id_t]
|
|
+cell
|
|
| The ID of the attribute to look up. The attributes are
|
|
| enumerated in #[code spacy.attrs].
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell The value of the attribute.
|
|
|
|
+h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr
|
|
+tag staticmethod
|
|
+tag nogil
|
|
|
|
p Set the value of an attribute of the #[code TokenC] struct by attribute ID.
|
|
|
|
+aside-code("Example").
|
|
from spacy.attrs cimport TAG
|
|
from spacy.tokens cimport Token
|
|
|
|
token = &doc.c[3]
|
|
Token.set_struct_attr(token, TAG, 0)
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code token]
|
|
+cell #[code TokenC*]
|
|
+cell A pointer to a #[code TokenC] struct.
|
|
|
|
+row
|
|
+cell #[code feat_name]
|
|
+cell #[code attr_id_t]
|
|
+cell
|
|
| The ID of the attribute to set. The attributes are
|
|
| enumerated in #[code spacy.attrs].
|
|
|
|
+row
|
|
+cell #[code value]
|
|
+cell #[+abbr("uint64_t") #[code attr_t]]
|
|
+cell The value to set.
|
|
|
|
+h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start
|
|
+tag function
|
|
|
|
p Find a token in a #[code TokenC*] array by the offset of its first character.
|
|
|
|
+aside-code("Example").
|
|
from spacy.tokens.doc cimport Doc, token_by_start
|
|
from spacy.vocab cimport Vocab
|
|
|
|
doc = Doc(Vocab(), words=[u'hello', u'world'])
|
|
assert token_by_start(doc.c, doc.length, 6) == 1
|
|
assert token_by_start(doc.c, doc.length, 4) == -1
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code tokens]
|
|
+cell #[code const TokenC*]
|
|
+cell A #[code TokenC*] array.
|
|
|
|
+row
|
|
+cell #[code length]
|
|
+cell #[code int]
|
|
+cell The number of tokens in the array.
|
|
|
|
+row
|
|
+cell #[code start_char]
|
|
+cell #[code int]
|
|
+cell The start index to search for.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell #[code int]
|
|
+cell The index of the token in the array or #[code -1] if not found.
|
|
|
|
+h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end
|
|
+tag function
|
|
|
|
p Find a token in a #[code TokenC*] array by its end character offset (exclusive, i.e. one past its final character).
|
|
|
|
+aside-code("Example").
|
|
from spacy.tokens.doc cimport Doc, token_by_end
|
|
from spacy.vocab cimport Vocab
|
|
|
|
doc = Doc(Vocab(), words=[u'hello', u'world'])
|
|
assert token_by_end(doc.c, doc.length, 5) == 0
|
|
assert token_by_end(doc.c, doc.length, 1) == -1
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code tokens]
|
|
+cell #[code const TokenC*]
|
|
+cell A #[code TokenC*] array.
|
|
|
|
+row
|
|
+cell #[code length]
|
|
+cell #[code int]
|
|
+cell The number of tokens in the array.
|
|
|
|
+row
|
|
+cell #[code end_char]
|
|
+cell #[code int]
|
|
+cell The end index to search for.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell #[code int]
|
|
+cell The index of the token in the array or #[code -1] if not found.
|
|
|
|
+h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads
|
|
+tag function
|
|
|
|
p
|
|
| Set attributes that allow lookup of syntactic children on a
|
|
| #[code TokenC*] array. This function must be called after making changes
|
|
| to the #[code TokenC.head] attribute, in order to make the parse tree
|
|
| navigation consistent.
|
|
|
|
+aside-code("Example").
|
|
from spacy.tokens.doc cimport Doc, set_children_from_heads
|
|
from spacy.vocab cimport Vocab
|
|
|
|
doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
|
|
doc.c[0].head = 0
|
|
doc.c[1].head = -1
|
|
doc.c[2].head = 1
|
|
doc.c[3].head = -2
|
|
set_children_from_heads(doc.c, doc.length)
|
|
assert doc.c[3].l_kids == 1
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code tokens]
|
|
+cell #[code TokenC*]
|
|
+cell A #[code TokenC*] array.
|
|
|
|
+row
|
|
+cell #[code length]
|
|
+cell #[code int]
|
|
+cell The number of tokens in the array.
|