//- 💫 DOCS > API > GOLDPARSE include ../_includes/_mixins p Collection for training annotations. +h(2, "init") GoldParse.__init__ +tag method p Create a #[code GoldParse]. +table(["Name", "Type", "Description"]) +row +cell #[code doc] +cell #[code Doc] +cell The document the annotations refer to. +row +cell #[code words] +cell iterable +cell A sequence of unicode word strings. +row +cell #[code tags] +cell iterable +cell A sequence of strings, representing tag annotations. +row +cell #[code heads] +cell iterable +cell A sequence of integers, representing syntactic head offsets. +row +cell #[code deps] +cell iterable +cell A sequence of strings, representing the syntactic relation types. +row +cell #[code entities] +cell iterable +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. +row("foot") +cell returns +cell #[code GoldParse] +cell The newly constructed object. +h(2, "len") GoldParse.__len__ +tag method p Get the number of gold-standard tokens. +table(["Name", "Type", "Description"]) +row("foot") +cell returns +cell int +cell The number of gold-standard tokens. +h(2, "is_projective") GoldParse.is_projective +tag property p | Whether the provided syntactic annotations form a projective dependency | tree. +table(["Name", "Type", "Description"]) +row("foot") +cell returns +cell bool +cell Whether the annotations form a projective tree. +h(2, "attributes") Attributes +table(["Name", "Type", "Description"]) +row +cell #[code tags] +cell list +cell The part-of-speech tag annotations. +row +cell #[code heads] +cell list +cell The syntactic head annotations. +row +cell #[code labels] +cell list +cell The syntactic relation-type annotations. +row +cell #[code ents] +cell list +cell The named entity annotations. +row +cell #[code cand_to_gold] +cell list +cell The alignment from candidate tokenization to gold tokenization. 
+row +cell #[code gold_to_cand] +cell list +cell The alignment from gold tokenization to candidate tokenization. +row +cell #[code cats] #[+tag-new(2)] +cell list +cell | Entries in the list should be either a label or a | #[code (start, end, label)] triple. The triple form is used for | categories applied to spans of the document. +h(2, "util") Utilities +h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets +tag function p | Encode labelled spans into per-token tags, using the | #[+a("/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out). p | Returns a list of unicode strings, describing the tags. Each tag string | will be of the form #[code "-"], #[code "O"] or | #[code "{action}-{label}"], where action is one of #[code "B"], | #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"] | is used where the entity offsets don't align with the tokenization in the | #[code Doc] object. The training algorithm will view these as missing | values. #[code O] denotes a non-entity token. #[code B] denotes the | beginning of a multi-token entity, #[code I] the inside of an entity | of three or more tokens, and #[code L] the end of an entity of two or | more tokens. #[code U] denotes a single-token entity. +aside-code("Example"). from spacy.gold import biluo_tags_from_offsets doc = nlp('I like London.') entities = [(7, 13, 'LOC')] tags = biluo_tags_from_offsets(doc, entities) assert tags == ['O', 'O', 'U-LOC', 'O'] +table(["Name", "Type", "Description"]) +row +cell #[code doc] +cell #[code Doc] +cell | The document that the entity offsets refer to. The output tags | will refer to the token boundaries within the document. +row +cell #[code entities] +cell iterable +cell | A sequence of #[code (start, end, label)] triples. #[code start] | and #[code end] should be character-offset integers denoting the | slice into the original string. 
+row("foot") +cell returns +cell list +cell | Unicode strings, describing the | #[+a("/api/annotation#biluo") BILUO] tags.