mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
Update docstrings and API docs for GoldParse
This commit is contained in:
parent
465a1dd710
commit
075f5ff87a
|
@ -225,25 +225,17 @@ cdef class GoldParse:
|
||||||
|
|
||||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||||
deps=None, entities=None, make_projective=False):
|
deps=None, entities=None, make_projective=False):
|
||||||
"""
|
"""Create a GoldParse.
|
||||||
Create a GoldParse.
|
|
||||||
|
|
||||||
Arguments:
|
doc (Doc): The document the annotations refer to.
|
||||||
doc (Doc):
|
words (iterable): A sequence of unicode word strings.
|
||||||
The document the annotations refer to.
|
tags (iterable): A sequence of strings, representing tag annotations.
|
||||||
words:
|
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
||||||
A sequence of unicode word strings.
|
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
||||||
tags:
|
entities (iterable): A sequence of named entity annotations, either as
|
||||||
A sequence of strings, representing tag annotations.
|
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||||
heads:
|
representing the entity positions.
|
||||||
A sequence of integers, representing syntactic head offsets.
|
RETURNS (GoldParse): The newly constructed object.
|
||||||
deps:
|
|
||||||
A sequence of strings, representing the syntactic relation types.
|
|
||||||
entities:
|
|
||||||
A sequence of named entity annotations, either as BILUO tag strings,
|
|
||||||
or as (start_char, end_char, label) tuples, representing the entity
|
|
||||||
positions.
|
|
||||||
Returns (GoldParse): The newly constructed object.
|
|
||||||
"""
|
"""
|
||||||
if words is None:
|
if words is None:
|
||||||
words = [token.text for token in doc]
|
words = [token.text for token in doc]
|
||||||
|
@ -308,55 +300,45 @@ cdef class GoldParse:
|
||||||
self.heads = proj_heads
|
self.heads = proj_heads
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""
|
"""Get the number of gold-standard tokens.
|
||||||
Get the number of gold-standard tokens.
|
|
||||||
|
|
||||||
Returns (int): The number of gold-standard tokens.
|
RETURNS (int): The number of gold-standard tokens.
|
||||||
"""
|
"""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_projective(self):
|
def is_projective(self):
|
||||||
"""
|
"""Whether the provided syntactic annotations form a projective
|
||||||
Whether the provided syntactic annotations form a projective dependency
|
dependency tree.
|
||||||
tree.
|
|
||||||
"""
|
"""
|
||||||
return not nonproj.is_nonproj_tree(self.heads)
|
return not nonproj.is_nonproj_tree(self.heads)
|
||||||
|
|
||||||
|
|
||||||
def biluo_tags_from_offsets(doc, entities):
|
def biluo_tags_from_offsets(doc, entities):
|
||||||
"""
|
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||||
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
scheme (BILUO).
|
||||||
scheme (biluo).
|
|
||||||
|
|
||||||
Arguments:
|
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||||
doc (Doc):
|
will refer to the token boundaries within the document.
|
||||||
The document that the entity offsets refer to. The output tags will
|
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
||||||
refer to the token boundaries within the document.
|
`end` should be character-offset integers denoting the slice into the
|
||||||
|
original string.
|
||||||
|
|
||||||
entities (sequence):
|
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||||
A sequence of (start, end, label) triples. start and end should be
|
string will be of the form either "-", "O" or "{action}-{label}", where
|
||||||
character-offset integers denoting the slice into the original string.
|
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||||
|
entity offsets don't align with the tokenization in the `Doc` object. The
|
||||||
|
training algorithm will view these as missing values. "O" denotes a
|
||||||
|
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||||
|
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||||
|
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||||
|
|
||||||
Returns:
|
EXAMPLE:
|
||||||
tags (list):
|
>>> text = 'I like London.'
|
||||||
A list of unicode strings, describing the tags. Each tag string will
|
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||||
be of the form either "", "O" or "{action}-{label}", where action is one
|
>>> doc = nlp.tokenizer(text)
|
||||||
of "B", "I", "L", "U". The string "-" is used where the entity
|
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||||
offsets don't align with the tokenization in the Doc object. The
|
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||||
training algorithm will view these as missing values. "O" denotes
|
|
||||||
a non-entity token. "B" denotes the beginning of a multi-token entity,
|
|
||||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
|
||||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
text = 'I like London.'
|
|
||||||
entities = [(len('I like '), len('I like London'), 'LOC')]
|
|
||||||
doc = nlp.tokenizer(text)
|
|
||||||
|
|
||||||
tags = biluo_tags_from_offsets(doc, entities)
|
|
||||||
|
|
||||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
|
||||||
"""
|
"""
|
||||||
starts = {token.idx: token.i for token in doc}
|
starts = {token.idx: token.i for token in doc}
|
||||||
ends = {token.idx+len(token): token.i for token in doc}
|
ends = {token.idx+len(token): token.i for token in doc}
|
||||||
|
|
|
@ -17,27 +17,27 @@ p Create a GoldParse.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code words]
|
+cell #[code words]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of unicode word strings.
|
+cell A sequence of unicode word strings.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code tags]
|
+cell #[code tags]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of strings, representing tag annotations.
|
+cell A sequence of strings, representing tag annotations.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code heads]
|
+cell #[code heads]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of integers, representing syntactic head offsets.
|
+cell A sequence of integers, representing syntactic head offsets.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code deps]
|
+cell #[code deps]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of strings, representing the syntactic relation types.
|
+cell A sequence of strings, representing the syntactic relation types.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code entities]
|
+cell #[code entities]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
|
@ -102,3 +102,58 @@ p
|
||||||
+cell #[code gold_to_cand]
|
+cell #[code gold_to_cand]
|
||||||
+cell list
|
+cell list
|
||||||
+cell The alignment from gold tokenization to candidate tokenization.
|
+cell The alignment from gold tokenization to candidate tokenization.
|
||||||
|
|
||||||
|
|
||||||
|
+h(2, "util") Utilities
|
||||||
|
|
||||||
|
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
|
||||||
|
+tag function
|
||||||
|
|
||||||
|
p
|
||||||
|
| Encode labelled spans into per-token tags, using the
|
||||||
|
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
|
||||||
|
|
||||||
|
p
|
||||||
|
| Returns a list of unicode strings, describing the tags. Each tag string
|
||||||
|
| will be of the form either #[code "-"], #[code "O"] or
|
||||||
|
| #[code "{action}-{label}"], where action is one of #[code "B"],
|
||||||
|
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
|
||||||
|
| is used where the entity offsets don't align with the tokenization in the
|
||||||
|
| #[code Doc] object. The training algorithm will view these as missing
|
||||||
|
| values. #[code O] denotes a non-entity token. #[code B] denotes the
|
||||||
|
| beginning of a multi-token entity, #[code I] the inside of an entity
|
||||||
|
| of three or more tokens, and #[code L] the end of an entity of two or
|
||||||
|
| more tokens. #[code U] denotes a single-token entity.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.gold import biluo_tags_from_offsets
|
||||||
|
text = 'I like London.'
|
||||||
|
entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||||
|
doc = nlp.tokenizer(text)
|
||||||
|
tags = biluo_tags_from_offsets(doc, entities)
|
||||||
|
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code doc]
|
||||||
|
+cell #[code Doc]
|
||||||
|
+cell
|
||||||
|
| The document that the entity offsets refer to. The output tags
|
||||||
|
| will refer to the token boundaries within the document.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code entities]
|
||||||
|
+cell iterable
|
||||||
|
+cell
|
||||||
|
| A sequence of #[code (start, end, label)] triples. #[code start]
|
||||||
|
| and #[code end] should be character-offset integers denoting the
|
||||||
|
| slice into the original string.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell list
|
||||||
|
+cell
|
||||||
|
| Unicode strings, describing the
|
||||||
|
| #[+a("/docs/api/annotation#biluo") BILUO] tags.
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user