Update docstrings and API docs for GoldParse

2025-07-21 21:49:49 +03:00 · 2017-05-21 13:53:46 +02:00 · 2017-05-21 13:53:46 +02:00 · 075f5ff87a
commit 075f5ff87a
parent 465a1dd710
2 changed files with 95 additions and 58 deletions
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -225,25 +225,17 @@ cdef class GoldParse:
    def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                 deps=None, entities=None, make_projective=False):
-        """
+        """Create a GoldParse.
        Create a GoldParse.
-        Arguments:
+        doc (Doc): The document the annotations refer to.
-            doc (Doc):
+        words (iterable): A sequence of unicode word strings.
-                The document the annotations refer to.
+        tags (iterable): A sequence of strings, representing tag annotations.
-            words:
+        heads (iterable): A sequence of integers, representing syntactic head offsets.
-                A sequence of unicode word strings.
+        deps (iterable): A sequence of strings, representing the syntactic relation types.
-            tags:
+        entities (iterable): A sequence of named entity annotations, either as
-                A sequence of strings, representing tag annotations.
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
-            heads:
+            representing the entity positions.
-                A sequence of integers, representing syntactic head offsets.
+        RETURNS (GoldParse): The newly constructed object.
            deps:
                A sequence of strings, representing the syntactic relation types.
            entities:
                A sequence of named entity annotations, either as BILUO tag strings,
                or as (start_char, end_char, label) tuples, representing the entity
                positions.
        Returns (GoldParse): The newly constructed object.
        """
        if words is None:
            words = [token.text for token in doc]
@ -308,55 +300,45 @@ cdef class GoldParse:
            self.heads = proj_heads
    def __len__(self):
-        """
+        """Get the number of gold-standard tokens.
        Get the number of gold-standard tokens.
-        Returns (int): The number of gold-standard tokens.
+        RETURNS (int): The number of gold-standard tokens.
        """
        return self.length
    @property
    def is_projective(self):
-        """
+        """Whether the provided syntactic annotations form a projective
-        Whether the provided syntactic annotations form a projective dependency
+        dependency tree.
        tree.
        """
        return not nonproj.is_nonproj_tree(self.heads)
 def biluo_tags_from_offsets(doc, entities):
-    """
+    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    scheme (BILUO).
    scheme (biluo).
-    Arguments:
+    doc (Doc): The document that the entity offsets refer to. The output tags
-        doc (Doc):
+        will refer to the token boundaries within the document.
-            The document that the entity offsets refer to. The output tags will
+    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
-            refer to the token boundaries within the document.
+        `end` should be character-offset integers denoting the slice into the
        original string.
-        entities (sequence):
+    RETURNS (list): A list of unicode strings, describing the tags. Each tag
-            A sequence of (start, end, label) triples. start and end should be
+        string will be of the form either "-", "O" or "{action}-{label}", where
-            character-offset integers denoting the slice into the original string.
+        action is one of "B", "I", "L", "U". The string "-" is used where the
-
+        entity offsets don't align with the tokenization in the `Doc` object. The
-    Returns:
+        training algorithm will view these as missing values. "O" denotes a
-        tags (list):
+        non-entity token. "B" denotes the beginning of a multi-token entity,
            A list of unicode strings, describing the tags. Each tag string will
            be of the form either "", "O" or "{action}-{label}", where action is one
            of "B", "I", "L", "U". The string "-" is used where the entity
            offsets don't align with the tokenization in the Doc object. The
            training algorithm will view these as missing values. "O" denotes
            a non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.
-    Example:
+    EXAMPLE:
-        text = 'I like London.'
+        >>> text = 'I like London.'
-        entities = [(len('I like '), len('I like London'), 'LOC')]
+        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
-        doc = nlp.tokenizer(text)
+        >>> doc = nlp.tokenizer(text)
-
+        >>> tags = biluo_tags_from_offsets(doc, entities)
-        tags = biluo_tags_from_offsets(doc, entities)
+        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
        assert tags == ['O', 'O', 'U-LOC', 'O']
    """
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx+len(token): token.i for token in doc}
--- a/website/docs/api/goldparse.jade
+++ b/website/docs/api/goldparse.jade
@ -17,27 +17,27 @@ p Create a GoldParse.
    +row
        +cell #[code words]
-        +cell -
+        +cell iterable
        +cell A sequence of unicode word strings.
    +row
        +cell #[code tags]
-        +cell -
+        +cell iterable
        +cell A sequence of strings, representing tag annotations.
    +row
        +cell #[code heads]
-        +cell -
+        +cell iterable
        +cell A sequence of integers, representing syntactic head offsets.
    +row
        +cell #[code deps]
-        +cell -
+        +cell iterable
        +cell A sequence of strings, representing the syntactic relation types.
    +row
        +cell #[code entities]
-        +cell -
+        +cell iterable
        +cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
    +footrow
@ -102,3 +102,58 @@ p
        +cell #[code gold_to_cand]
        +cell list
        +cell The alignment from gold tokenization to candidate tokenization.
 +h(2, "util") Utilities
 +h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
    +tag function
 p
    |  Encode labelled spans into per-token tags, using the
    |  #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
 p
    |  Returns a list of unicode strings, describing the tags. Each tag string
    |  will be of the form either #[code &quot;-&quot;], #[code "O"] or
    |  #[code "{action}-{label}"], where action is one of #[code "B"],
    |  #[code "I"], #[code "L"], #[code "U"]. The string #[code &quot;-&quot;]
    |  is used where the entity offsets don't align with the tokenization in the
    |  #[code Doc] object. The training algorithm will view these as missing
    |  values. #[code O] denotes a non-entity token. #[code B] denotes the
    |  beginning of a multi-token entity, #[code I] the inside of an entity
    |  of three or more tokens, and #[code L] the end of an entity of two or
    |  more tokens. #[code U] denotes a single-token entity.
 +aside-code("Example").
    from spacy.gold import biluo_tags_from_offsets
    text = 'I like London.'
    entities = [(len('I like '), len('I like London'), 'LOC')]
    doc = nlp.tokenizer(text)
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'U-LOC', 'O']
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell
            |  The document that the entity offsets refer to. The output tags
            |  will refer to the token boundaries within the document.
    +row
        +cell #[code entities]
        +cell iterable
        +cell
            |  A sequence of #[code (start, end, label)] triples. #[code start]
            |  and #[code end] should be character-offset integers denoting the
            |  slice into the original string.
    +footrow
        +cell returns
        +cell list
        +cell
            |  Unicode strings, describing the
            |  #[+a("/docs/api/annotation#biluo") BILUO] tags.