spaCy/website/api/_cython/_tokenc.jade

//- 💫 DOCS > API > CYTHON > STRUCTS > TOKENC

p
    |  Cython data container for the #[code Token] object.

+aside-code("Example").
    token = &doc.c[3]
    token_ptr = &doc.c[3]

+table(["Name", "Type", "Description"])
    +row
        +cell #[code lex]
        +cell #[code const LexemeC*]
        +cell A pointer to the lexeme for the token.

    +row
        +cell #[code morph]
        +cell #[code uint64_t]
        +cell An ID allowing lookup of morphological attributes.

    +row
        +cell #[code pos]
        +cell #[code univ_pos_t]
        +cell Coarse-grained part-of-speech tag.

    +row
        +cell #[code spacy]
        +cell #[code bint]
        +cell A binary value indicating whether the token has trailing whitespace.

    +row
        +cell #[code tag]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Fine-grained part-of-speech tag.

    +row
        +cell #[code idx]
        +cell #[code int]
        +cell The character offset of the token within the parent document.

    +row
        +cell #[code lemma]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Base form of the token, with no inflectional suffixes.

    +row
        +cell #[code sense]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Space for storing a word sense ID, currently unused.

    +row
        +cell #[code head]
        +cell #[code int]
        +cell Offset of the syntactic parent relative to the token.

    +row
        +cell #[code dep]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Syntactic dependency relation.

    +row
        +cell #[code l_kids]
        +cell #[code uint32_t]
        +cell Number of left children.

    +row
        +cell #[code r_kids]
        +cell #[code uint32_t]
        +cell Number of right children.

    +row
        +cell #[code l_edge]
        +cell #[code uint32_t]
        +cell Offset of the leftmost token of this token's syntactic descendants.

    +row
        +cell #[code r_edge]
        +cell #[code uint32_t]
        +cell Offset of the rightmost token of this token's syntactic descendants.

    +row
        +cell #[code sent_start]
        +cell #[code int]
        +cell
            |  Ternary value indicating whether the token is the first word of
            |  a sentence. #[code 0] indicates a missing value, #[code -1]
            |  indicates #[code False] and #[code 1] indicates #[code True]. The default value, 0,
            |  is interpreted as no sentence break. Sentence boundary detectors will usually
            |  set 0 for all tokens except tokens that follow a sentence boundary.

    +row
        +cell #[code ent_iob]
        +cell #[code int]
        +cell
            |  IOB code of named entity tag. #[code 0] indicates a missing
            |  value, #[code 1] indicates #[code I], #[code 2] indicates
            |  #[code O] and #[code 3] indicates #[code B].

    +row
        +cell #[code ent_type]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell Named entity type.

    +row
        +cell #[code ent_id]
        +cell #[+abbr("uint64_t") #[code hash_t]]
        +cell
            |  ID of the entity the token is an instance of, if any. Currently
            |  not used, but could potentially be used for coreference resolution.

+h(3, "token_get_struct_attr", "spacy/tokens/token.pxd") Token.get_struct_attr
    +tag staticmethod
    +tag nogil

p Get the value of an attribute from the #[code TokenC] struct by attribute ID.

+aside-code("Example").
    from spacy.attrs cimport IS_ALPHA
    from spacy.tokens cimport Token

    is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code token]
        +cell #[code const TokenC*]
        +cell A pointer to a #[code TokenC] struct.

    +row
        +cell #[code feat_name]
        +cell #[code attr_id_t]
        +cell
            |  The ID of the attribute to look up. The attributes are
            |  enumerated in #[code spacy.attrs].

    +row("foot")
        +cell returns
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell The value of the attribute.

+h(3, "token_set_struct_attr", "spacy/tokens/token.pxd") Token.set_struct_attr
    +tag staticmethod
    +tag nogil

p Set the value of an attribute of the #[code TokenC] struct by attribute ID.

+aside-code("Example").
    from spacy.attrs cimport TAG
    from spacy.tokens cimport Token

    token = &doc.c[3]
    Token.set_struct_attr(token, TAG, 0)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code token]
        +cell #[code TokenC*]
        +cell A pointer to a #[code TokenC] struct.

    +row
        +cell #[code feat_name]
        +cell #[code attr_id_t]
        +cell
            |  The ID of the attribute to set. The attributes are
            |  enumerated in #[code spacy.attrs].

    +row
        +cell #[code value]
        +cell #[+abbr("uint64_t") #[code attr_t]]
        +cell The value to set.

+h(3, "token_by_start", "spacy/tokens/doc.pxd") token_by_start
    +tag function

p Find a token in a #[code TokenC*] array by the offset of its first character.

+aside-code("Example").
    from spacy.tokens.doc cimport Doc, token_by_start
    from spacy.vocab cimport Vocab

    doc = Doc(Vocab(), words=[u'hello', u'world'])
    assert token_by_start(doc.c, doc.length, 6) == 1
    assert token_by_start(doc.c, doc.length, 4) == -1

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tokens]
        +cell #[code const TokenC*]
        +cell A #[code TokenC*] array.

    +row
        +cell #[code length]
        +cell #[code int]
        +cell The number of tokens in the array.

    +row
        +cell #[code start_char]
        +cell #[code int]
        +cell The start index to search for.

    +row("foot")
        +cell returns
        +cell #[code int]
        +cell The index of the token in the array or #[code -1] if not found.

+h(3, "token_by_end", "spacy/tokens/doc.pxd") token_by_end
    +tag function

p Find a token in a #[code TokenC*] array by the character offset at which it ends (one past its final character).

+aside-code("Example").
    from spacy.tokens.doc cimport Doc, token_by_end
    from spacy.vocab cimport Vocab

    doc = Doc(Vocab(), words=[u'hello', u'world'])
    assert token_by_end(doc.c, doc.length, 5) == 0
    assert token_by_end(doc.c, doc.length, 1) == -1

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tokens]
        +cell #[code const TokenC*]
        +cell A #[code TokenC*] array.

    +row
        +cell #[code length]
        +cell #[code int]
        +cell The number of tokens in the array.

    +row
        +cell #[code end_char]
        +cell #[code int]
        +cell The end index to search for.

    +row("foot")
        +cell returns
        +cell #[code int]
        +cell The index of the token in the array or #[code -1] if not found.

+h(3, "set_children_from_heads", "spacy/tokens/doc.pxd") set_children_from_heads
    +tag function

p
    |  Set attributes that allow lookup of syntactic children on a
    |  #[code TokenC*] array. This function must be called after making changes
    |  to the #[code TokenC.head] attribute, in order to make the parse tree
    |  navigation consistent.

+aside-code("Example").
    from spacy.tokens.doc cimport Doc, set_children_from_heads
    from spacy.vocab cimport Vocab

    doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe'])
    doc.c[0].head = 0
    doc.c[1].head = -1
    doc.c[2].head = 1
    doc.c[3].head = -2
    set_children_from_heads(doc.c, doc.length)
    assert doc.c[3].l_kids == 1

+table(["Name", "Type", "Description"])
    +row
        +cell #[code tokens]
        +cell #[code TokenC*]
        +cell A #[code TokenC*] array.

    +row
        +cell #[code length]
        +cell #[code int]
        +cell The number of tokens in the array.