Update docstrings and API docs for GoldParse

This commit is contained in:
ines 2017-05-21 13:53:46 +02:00
parent 465a1dd710
commit 075f5ff87a
2 changed files with 95 additions and 58 deletions

View File

@ -225,25 +225,17 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False): deps=None, entities=None, make_projective=False):
""" """Create a GoldParse.
Create a GoldParse.
Arguments: doc (Doc): The document the annotations refer to.
doc (Doc): words (iterable): A sequence of unicode word strings.
The document the annotations refer to. tags (iterable): A sequence of strings, representing tag annotations.
words: heads (iterable): A sequence of integers, representing syntactic head offsets.
A sequence of unicode word strings. deps (iterable): A sequence of strings, representing the syntactic relation types.
tags: entities (iterable): A sequence of named entity annotations, either as
A sequence of strings, representing tag annotations. BILUO tag strings, or as `(start_char, end_char, label)` tuples,
heads: representing the entity positions.
A sequence of integers, representing syntactic head offsets. RETURNS (GoldParse): The newly constructed object.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
""" """
if words is None: if words is None:
words = [token.text for token in doc] words = [token.text for token in doc]
@ -308,55 +300,45 @@ cdef class GoldParse:
self.heads = proj_heads self.heads = proj_heads
def __len__(self): def __len__(self):
""" """Get the number of gold-standard tokens.
Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens. RETURNS (int): The number of gold-standard tokens.
""" """
return self.length return self.length
@property @property
def is_projective(self): def is_projective(self):
""" """Whether the provided syntactic annotations form a projective
Whether the provided syntactic annotations form a projective dependency dependency tree.
tree.
""" """
return not nonproj.is_nonproj_tree(self.heads) return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities): def biluo_tags_from_offsets(doc, entities):
""" """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO).
scheme (biluo).
Arguments: doc (Doc): The document that the entity offsets refer to. The output tags
doc (Doc): will refer to the token boundaries within the document.
The document that the entity offsets refer to. The output tags will entities (iterable): A sequence of `(start, end, label)` triples. `start` and
refer to the token boundaries within the document. `end` should be character-offset integers denoting the slice into the
original string.
entities (sequence): RETURNS (list): A list of unicode strings, describing the tags. Each tag
A sequence of (start, end, label) triples. start and end should be string will be of the form either "-", "O" or "{action}-{label}", where
character-offset integers denoting the slice into the original string. action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Returns: EXAMPLE:
tags (list): >>> text = 'I like London.'
A list of unicode strings, describing the tags. Each tag string will >>> entities = [(len('I like '), len('I like London'), 'LOC')]
be of the form either "", "O" or "{action}-{label}", where action is one >>> doc = nlp.tokenizer(text)
of "B", "I", "L", "U". The string "-" is used where the entity >>> tags = biluo_tags_from_offsets(doc, entities)
offsets don't align with the tokenization in the Doc object. The >>> assert tags == ['O', 'O', 'U-LOC', 'O']
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Example:
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
""" """
starts = {token.idx: token.i for token in doc} starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc} ends = {token.idx+len(token): token.i for token in doc}

View File

@ -17,27 +17,27 @@ p Create a GoldParse.
+row +row
+cell #[code words] +cell #[code words]
+cell - +cell iterable
+cell A sequence of unicode word strings. +cell A sequence of unicode word strings.
+row +row
+cell #[code tags] +cell #[code tags]
+cell - +cell iterable
+cell A sequence of strings, representing tag annotations. +cell A sequence of strings, representing tag annotations.
+row +row
+cell #[code heads] +cell #[code heads]
+cell - +cell iterable
+cell A sequence of integers, representing syntactic head offsets. +cell A sequence of integers, representing syntactic head offsets.
+row +row
+cell #[code deps] +cell #[code deps]
+cell - +cell iterable
+cell A sequence of strings, representing the syntactic relation types. +cell A sequence of strings, representing the syntactic relation types.
+row +row
+cell #[code entities] +cell #[code entities]
+cell - +cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+footrow +footrow
@ -102,3 +102,58 @@ p
+cell #[code gold_to_cand] +cell #[code gold_to_cand]
+cell list +cell list
+cell The alignment from gold tokenization to candidate tokenization. +cell The alignment from gold tokenization to candidate tokenization.
+h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be of the form either #[code "-"], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code O] denotes a non-entity token. #[code B] denotes the
| beginning of a multi-token entity, #[code I] the inside of an entity
| of three or more tokens, and #[code L] the end of an entity of two or
| more tokens. #[code U] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell
| The document that the entity offsets refer to. The output tags
| will refer to the token boundaries within the document.
+row
+cell #[code entities]
+cell iterable
+cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
+footrow
+cell returns
+cell list
+cell
| Unicode strings, describing the
| #[+a("/docs/api/annotation#biluo") BILUO] tags.