mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			204 lines
		
	
	
		
			5.9 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			204 lines
		
	
	
		
			5.9 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > API > GOLDPARSE
 | |
| 
 | |
| include ../_includes/_mixins
 | |
| 
 | |
| p Collection for training annotations.
 | |
| 
 | |
| +h(2, "init") GoldParse.__init__
 | |
|     +tag method
 | |
| 
 | |
| p Create a #[code GoldParse].
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code doc]
 | |
|         +cell #[code Doc]
 | |
|         +cell The document the annotations refer to.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code words]
 | |
|         +cell iterable
 | |
|         +cell A sequence of unicode word strings.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code tags]
 | |
|         +cell iterable
 | |
|         +cell A sequence of strings, representing tag annotations.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code heads]
 | |
|         +cell iterable
 | |
|         +cell A sequence of integers, representing syntactic head offsets.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code deps]
 | |
|         +cell iterable
 | |
|         +cell A sequence of strings, representing the syntactic relation types.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code entities]
 | |
|         +cell iterable
 | |
|         +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell #[code GoldParse]
 | |
|         +cell The newly constructed object.
 | |
| 
 | |
| +h(2, "len") GoldParse.__len__
 | |
|     +tag method
 | |
| 
 | |
| p Get the number of gold-standard tokens.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell int
 | |
|         +cell The number of gold-standard tokens.
 | |
| 
 | |
| +h(2, "is_projective") GoldParse.is_projective
 | |
|     +tag property
 | |
| 
 | |
| p
 | |
|     |  Whether the provided syntactic annotations form a projective dependency
 | |
|     |  tree.
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell bool
 | |
|         +cell Whether the annotations form a projective tree.
 | |
| 
 | |
| 
 | |
| +h(2, "attributes") Attributes
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code tags]
 | |
|         +cell list
 | |
|         +cell The part-of-speech tag annotations.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code heads]
 | |
|         +cell list
 | |
|         +cell The syntactic head annotations.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code labels]
 | |
|         +cell list
 | |
|         +cell The syntactic relation-type annotations.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code ents]
 | |
|         +cell list
 | |
|         +cell The named entity annotations.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code cand_to_gold]
 | |
|         +cell list
 | |
|         +cell The alignment from candidate tokenization to gold tokenization.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code gold_to_cand]
 | |
|         +cell list
 | |
|         +cell The alignment from gold tokenization to candidate tokenization.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code cats] #[+tag-new(2)]
 | |
|         +cell list
 | |
|         +cell
 | |
|             |  Entries in the list should be either a label, or a
 | |
|             |  #[code (start, end, label)] triple. The tuple form is used for
 | |
|             |  categories applied to spans of the document.
 | |
| 
 | |
| 
 | |
| +h(2, "util") Utilities
 | |
| 
 | |
| +h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
 | |
|     +tag function
 | |
| 
 | |
| p
 | |
|     |  Encode labelled spans into per-token tags, using the
 | |
|     |  #[+a("/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
 | |
| 
 | |
| p
 | |
|     |  Returns a list of unicode strings, describing the tags. Each tag string
 | |
|     |  will be of the form of either #[code ""], #[code "O"] or
 | |
|     |  #[code "{action}-{label}"], where action is one of #[code "B"],
 | |
|     |  #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
 | |
|     |  is used where the entity offsets don't align with the tokenization in the
 | |
|     |  #[code Doc] object. The training algorithm will view these as missing
 | |
|     |  values. #[code O] denotes a non-entity token. #[code B] denotes the
 | |
|     |  beginning of a multi-token entity, #[code I] the inside of an entity
 | |
|     |  of three or more tokens, and #[code L] the end of an entity of two or
 | |
|     |  more tokens. #[code U] denotes a single-token entity.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.gold import biluo_tags_from_offsets
 | |
| 
 | |
|     doc = nlp(u'I like London.')
 | |
|     entities = [(7, 13, 'LOC')]
 | |
|     tags = biluo_tags_from_offsets(doc, entities)
 | |
|     assert tags == ['O', 'O', 'U-LOC', 'O']
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code doc]
 | |
|         +cell #[code Doc]
 | |
|         +cell
 | |
|             |  The document that the entity offsets refer to. The output tags
 | |
|             |  will refer to the token boundaries within the document.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code entities]
 | |
|         +cell iterable
 | |
|         +cell
 | |
|             |  A sequence of #[code (start, end, label)] triples. #[code start]
 | |
|             |  and #[code end] should be character-offset integers denoting the
 | |
|             |  slice into the original string.
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell list
 | |
|         +cell
 | |
|             |  Unicode strings, describing the
 | |
|             |  #[+a("/api/annotation#biluo") BILUO] tags.
 | |
| 
 | |
| +h(3, "offsets_from_biluo_tags") gold.offsets_from_biluo_tags
 | |
| 
 | |
| p
 | |
|     |  Decode per-token tags following the
 | |
|     |  #[+a("/api/annotation#biluo") BILUO scheme] into entity offsets.
 | |
| 
 | |
| +aside-code("Example").
 | |
|     from spacy.gold import offsets_from_biluo_tags
 | |
| 
 | |
|     doc = nlp(u'I like London.')
 | |
|     tags = ['O', 'O', 'U-LOC', 'O']
 | |
|     entities = offsets_from_biluo_tags(doc, tags)
 | |
|     assert entities == [(7, 13, 'LOC')]
 | |
| 
 | |
| +table(["Name", "Type", "Description"])
 | |
|     +row
 | |
|         +cell #[code doc]
 | |
|         +cell #[code Doc]
 | |
|         +cell The document that the BILUO tags refer to.
 | |
| 
 | |
|     +row
 | |
|         +cell #[code tags]
 | |
|         +cell iterable
 | |
|         +cell
 | |
|             |  A sequence of #[+a("/api/annotation#biluo") BILUO] tags with
 | |
|             |  each tag describing one token. Each tag string will be of the
 | |
|             |  form of either #[code ""], #[code "O"] or
 | |
|             |  #[code "{action}-{label}"], where action is one of #[code "B"],
 | |
|             |  #[code "I"], #[code "L"], #[code "U"].
 | |
| 
 | |
|     +row("foot")
 | |
|         +cell returns
 | |
|         +cell list
 | |
|         +cell
 | |
|             |  A sequence of #[code (start, end, label)] triples. #[code start]
 | |
|             |  and #[code end] will be character-offset integers denoting the
 | |
|             |  slice into the original string.
 |