Update docstrings and API docs for GoldParse

This commit is contained in:
ines 2017-05-21 13:53:46 +02:00
parent 465a1dd710
commit 075f5ff87a
2 changed files with 95 additions and 58 deletions

View File

@ -225,25 +225,17 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False): deps=None, entities=None, make_projective=False):
""" """Create a GoldParse.
Create a GoldParse.
Arguments: doc (Doc): The document the annotations refer to.
doc (Doc): words (iterable): A sequence of unicode word strings.
The document the annotations refer to. tags (iterable): A sequence of strings, representing tag annotations.
words: heads (iterable): A sequence of integers, representing syntactic head offsets.
A sequence of unicode word strings. deps (iterable): A sequence of strings, representing the syntactic relation types.
tags: entities (iterable): A sequence of named entity annotations, either as
A sequence of strings, representing tag annotations. BILUO tag strings, or as `(start_char, end_char, label)` tuples,
heads: representing the entity positions.
A sequence of integers, representing syntactic head offsets. RETURNS (GoldParse): The newly constructed object.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
""" """
if words is None: if words is None:
words = [token.text for token in doc] words = [token.text for token in doc]
@ -308,55 +300,45 @@ cdef class GoldParse:
self.heads = proj_heads self.heads = proj_heads
def __len__(self): def __len__(self):
""" """Get the number of gold-standard tokens.
Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens. RETURNS (int): The number of gold-standard tokens.
""" """
return self.length return self.length
@property @property
def is_projective(self): def is_projective(self):
""" """Whether the provided syntactic annotations form a projective
Whether the provided syntactic annotations form a projective dependency dependency tree.
tree.
""" """
return not nonproj.is_nonproj_tree(self.heads) return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities): def biluo_tags_from_offsets(doc, entities):
""" """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (BILUO).
scheme (biluo).
Arguments: doc (Doc): The document that the entity offsets refer to. The output tags
doc (Doc): will refer to the token boundaries within the document.
The document that the entity offsets refer to. The output tags will entities (iterable): A sequence of `(start, end, label)` triples. `start` and
refer to the token boundaries within the document. `end` should be character-offset integers denoting the slice into the
original string.
entities (sequence): RETURNS (list): A list of unicode strings, describing the tags. Each tag
A sequence of (start, end, label) triples. start and end should be string will be of the form either "", "O" or "{action}-{label}", where
character-offset integers denoting the slice into the original string. action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
Returns: training algorithm will view these as missing values. "O" denotes a
tags (list): non-entity token. "B" denotes the beginning of a multi-token entity,
A list of unicode strings, describing the tags. Each tag string will
be of the form either "", "O" or "{action}-{label}", where action is one
of "B", "I", "L", "U". The string "-" is used where the entity
offsets don't align with the tokenization in the Doc object. The
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end "I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity. of an entity of two or more tokens. "U" denotes a single-token entity.
Example: EXAMPLE:
text = 'I like London.' >>> text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')] >>> entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text) >>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
tags = biluo_tags_from_offsets(doc, entities) >>> assert tags == ['O', 'O', 'U-LOC', 'O']
assert tags == ['O', 'O', 'U-LOC', 'O']
""" """
starts = {token.idx: token.i for token in doc} starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc} ends = {token.idx+len(token): token.i for token in doc}

View File

@ -17,27 +17,27 @@ p Create a GoldParse.
+row +row
+cell #[code words] +cell #[code words]
+cell - +cell iterable
+cell A sequence of unicode word strings. +cell A sequence of unicode word strings.
+row +row
+cell #[code tags] +cell #[code tags]
+cell - +cell iterable
+cell A sequence of strings, representing tag annotations. +cell A sequence of strings, representing tag annotations.
+row +row
+cell #[code heads] +cell #[code heads]
+cell - +cell iterable
+cell A sequence of integers, representing syntactic head offsets. +cell A sequence of integers, representing syntactic head offsets.
+row +row
+cell #[code deps] +cell #[code deps]
+cell - +cell iterable
+cell A sequence of strings, representing the syntactic relation types. +cell A sequence of strings, representing the syntactic relation types.
+row +row
+cell #[code entities] +cell #[code entities]
+cell - +cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+footrow +footrow
@ -102,3 +102,58 @@ p
+cell #[code gold_to_cand] +cell #[code gold_to_cand]
+cell list +cell list
+cell The alignment from gold tokenization to candidate tokenization. +cell The alignment from gold tokenization to candidate tokenization.
+h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be of the form either #[code "-"], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code "O"] denotes a non-entity token. #[code "B"] denotes the
| beginning of a multi-token entity, #[code "I"] the inside of an entity
| of three or more tokens, and #[code "L"] the end of an entity of two or
| more tokens. #[code "U"] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell
| The document that the entity offsets refer to. The output tags
| will refer to the token boundaries within the document.
+row
+cell #[code entities]
+cell iterable
+cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
+footrow
+cell returns
+cell list
+cell
| Unicode strings, describing the
| #[+a("/docs/api/annotation#biluo") BILUO] tags.