Update docstrings and API docs for Language class

2025-05-28 09:43:17 +03:00 · 2017-05-18 23:57:38 +02:00 · 2017-05-18 23:57:38 +02:00 · d42bc16868
commit d42bc16868
parent 593361ee3c
2 changed files with 394 additions and 153 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -115,14 +115,26 @@ class BaseDefaults(object):


 class Language(object):
-    """
-    A text-processing pipeline. Usually you'll load this once per process, and
-    pass the instance around your program.
+    """A text-processing pipeline. Usually you'll load this once per process,
+    and pass the instance around your application.
    """
    Defaults = BaseDefaults
    lang = None

    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
+        """Initialise a Language object.
+
+        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
+            `Language.Defaults.create_vocab`.
+        make_doc (function): A function that takes text and returns a `Doc`
+            object. Usually a `Tokenizer`.
+        pipeline (list): A list of annotation processes or IDs of annotation
+            processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
+            up in `Language.Defaults.factories`.
+        meta (dict): Custom meta data for the Language class. Is written to by
+            models to add model meta data.
+        RETURNS (Language): The newly constructed object.
+        """
        self.meta = dict(meta)

        if vocab is True:
@ -146,23 +158,17 @@ class Language(object):
            self.pipeline = []

    def __call__(self, text, state=None, **disabled):
-        """
-        Apply the pipeline to some text.  The text can span multiple sentences,
-        and can contain arbtrary whitespace.  Alignment into the original string
+        """Apply the pipeline to some text. The text can span multiple sentences,
+        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

-        Args:
-            text (unicode): The text to be processed.
-            state: Arbitrary
+        text (unicode): The text to be processed.
+        **disabled: Elements of the pipeline that should not be run.
+        RETURNS (Doc): A container for accessing the annotations.

-        Returns:
-            doc (Doc): A container for accessing the annotations.
-
-        Example:
-            >>> from spacy.en import English
-            >>> nlp = English()
+        EXAMPLE:
            >>> tokens = nlp('An example sentence. Another example sentence.')
-            >>> tokens[0].orth_, tokens[0].head.tag_
+            >>> tokens[0].text, tokens[0].head.tag_
            ('An', 'NN')
        """
        doc = self.make_doc(text)
@ -174,16 +180,28 @@ class Language(object):
        return doc

    def update(self, docs, golds, state=None, drop=0., sgd=None):
+        """Update the models in the pipeline.
+
+        docs (iterable): A batch of `Doc` objects.
+        golds (iterable): A batch of `GoldParse` objects.
+        drop (float): The dropout rate.
+        sgd (function): An optimizer.
+        RETURNS (dict): Results from the update.
+
+        EXAMPLE:
+            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>>    for epoch in trainer.epochs(gold):
+            >>>        for docs, golds in epoch:
+            >>>            state = nlp.update(docs, golds, sgd=optimizer)
+        """
        grads = {}
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
        state = {} if state is None else state
        for process in self.pipeline:
            if hasattr(process, 'update'):
-                state = process.update(docs, golds,
-                            state=state,
-                            drop=drop,
-                            sgd=get_grads)
+                state = process.update(docs, golds, state=state, drop=drop,
+                                                    sgd=get_grads)
            else:
                process(docs, state=state)
        if sgd is not None:
@ -198,6 +216,19 @@ class Language(object):

    @contextmanager
    def begin_training(self, gold_tuples, **cfg):
+        """Allocate models, pre-process training data and acquire a trainer and
+        optimizer. Used as a contextmanager.
+
+        gold_tuples (iterable): Gold-standard training data.
+        **cfg: Config parameters.
+        YIELDS (tuple): A trainer and an optimizer.
+
+        EXAMPLE:
+            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>>    for epoch in trainer.epochs(gold):
+            >>>        for docs, golds in epoch:
+            >>>            state = nlp.update(docs, golds, sgd=optimizer)
+        """
        # Populate vocab
        for _, annots_brackets in gold_tuples:
            for annots, _ in annots_brackets:
@ -220,6 +251,17 @@ class Language(object):

    @contextmanager
    def use_params(self, params, **cfg):
+        """Replace weights of models in the pipeline with those provided in the
+        params dictionary. Can be used as a contextmanager, in which case,
+        models go back to their original weights after the block.
+
+        params (dict): A dictionary of parameters keyed by model ID.
+        **cfg: Config parameters.
+
+        EXAMPLE:
+            >>> with nlp.use_params(optimizer.averages):
+            >>>     nlp.to_disk('/tmp/checkpoint')
+        """
        contexts = [pipe.use_params(params) for pipe
                    in self.pipeline if hasattr(pipe, 'use_params')]
        # TODO: Having trouble with contextlib
@ -237,16 +279,20 @@ class Language(object):
                pass

    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
-        """
-        Process texts as a stream, and yield Doc objects in order.
+        """Process texts as a stream, and yield `Doc` objects in order. Supports
+        GIL-free multi-threading.

-        Supports GIL-free multi-threading.
+        texts (iterator): A sequence of texts to process.
+        n_threads (int): The number of worker threads to use. If -1, OpenMP will
+            decide how many to use at run time. Default is 2.
+        batch_size (int): The number of texts to buffer.
+        **disabled: Pipeline components to exclude.
+        YIELDS (Doc): Documents in the order of the original text.

-        Arguments:
-            texts (iterator)
-            tag (bool)
-            parse (bool)
-            entity (bool)
+        EXAMPLE:
+            >>> texts = [u'One document.', u'...', u'Lots of documents']
+            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+            >>>     assert doc.is_parsed
        """
        #stream = ((self.make_doc(text), None) for text in texts)
        stream = ((doc, {}) for doc in texts)
@ -254,7 +300,6 @@ class Language(object):
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
-
            if hasattr(proc, 'pipe'):
                stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
            else:
@ -265,11 +310,12 @@ class Language(object):
    def to_disk(self, path, **exclude):
        """Save the current state to a directory.

-        Args:
-            path: A path to a directory, which will be created if it doesn't
-                    exist. Paths may be either strings or pathlib.Path-like
-                    objects.
-            **exclude: Prevent named attributes from being saved.
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
+        **exclude: Named attributes to prevent from being saved.
+
+        EXAMPLE:
+            >>> nlp.to_disk('/path/to/models')
        """
        path = util.ensure_path(path)
        if not path.exists():
@ -288,12 +334,17 @@ class Language(object):
            dill.dump(props, file_)

    def from_disk(self, path, **exclude):
-        """Load the current state from a directory.
+        """Loads state from a directory. Modifies the object in place and
+        returns it.

-        Args:
-            path: A path to a directory. Paths may be either strings or
-                pathlib.Path-like objects.
-            **exclude: Prevent named attributes from being saved.
+        path (unicode or Path): A path to a directory. Paths may be either
+            strings or `Path`-like objects.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Language): The modified `Language` object.
+
+        EXAMPLE:
+            >>> from spacy.language import Language
+            >>> nlp = Language().from_disk('/path/to/models')
        """
        path = util.ensure_path(path)
        for name in path.iterdir():
@ -307,10 +358,8 @@ class Language(object):
    def to_bytes(self, **exclude):
        """Serialize the current state to a binary string.

-        Args:
-            path: A path to a directory. Paths may be either strings or
-                pathlib.Path-like objects.
-            **exclude: Prevent named attributes from being serialized.
+        **exclude: Named attributes to prevent from being serialized.
+        RETURNS (bytes): The serialized form of the `Language` object.
        """
        props = dict(self.__dict__)
        for key in exclude:
@ -321,9 +370,9 @@ class Language(object):
    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.

-        Args:
-            bytes_data (bytes): The data to load from.
-            **exclude: Prevent named attributes from being loaded.
+        bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Language): The `Language` object.
        """
        props = dill.loads(bytes_data)
        for key, value in props.items():
--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@ -2,7 +2,305 @@

 include ../../_includes/_mixins

-p A text processing pipeline.
+p
+    |  A text-processing pipeline. Usually you'll load this once per process,
+    |  and pass the instance around your application.
+
+h(2, "init") Language.__init__
+    +tag method
+
+p Initialise a #[code Language] object.
+
+aside-code("Example").
+    from spacy.language import Language
+    nlp = Language(pipeline=['token_vectors', 'tags',
+                             'dependencies'])
+
+    from spacy.lang.en import English
+    nlp = English()
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code vocab]
+        +cell #[code Vocab]
+        +cell
+            |  A #[code Vocab] object. If #[code True], a vocab is created via
+            |  #[code Language.Defaults.create_vocab].
+
+    +row
+        +cell #[code make_doc]
+        +cell function
+        +cell
+            |  A function that takes text and returns a #[code Doc] object.
+            |  Usually a #[code Tokenizer].
+
+    +row
+        +cell #[code pipeline]
+        +cell list
+        +cell
+            |  A list of annotation processes or IDs of annotation processes,
+            |  e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
+            |  up in #[code Language.Defaults.factories].
+
+    +row
+        +cell #[code meta]
+        +cell dict
+        +cell
+            |  Custom meta data for the #[code Language] class. Is written to by
+            |  models to add model meta data.
+
+    +footrow
+        +cell return
+        +cell #[code Language]
+        +cell The newly constructed object.
+
+h(2, "call") Language.__call__
+    +tag method
+
+p
+    |  Apply the pipeline to some text. The text can span multiple sentences,
+    |  and can contain arbitrary whitespace. Alignment into the original string
+    |  is preserved.
+
+aside-code("Example").
+    tokens = nlp('An example sentence. Another example sentence.')
+    tokens[0].text, tokens[0].head.tag_
+    # ('An', 'NN')
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code text]
+        +cell unicode
+        +cell The text to be processed.
+
+    +row
+        +cell #[code **disabled]
+        +cell -
+        +cell Elements of the pipeline that should not be run.
+
+    +footrow
+        +cell return
+        +cell #[code Doc]
+        +cell A container for accessing the annotations.
+
+h(2, "update") Language.update
+    +tag method
+
+p Update the models in the pipeline.
+
+aside-code("Example").
+    with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+        for epoch in trainer.epochs(gold):
+            for docs, golds in epoch:
+                state = nlp.update(docs, golds, sgd=optimizer)
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code docs]
+        +cell iterable
+        +cell A batch of #[code Doc] objects.
+
+    +row
+        +cell #[code golds]
+        +cell iterable
+        +cell A batch of #[code GoldParse] objects.
+
+    +row
+        +cell #[code drop]
+        +cell float
+        +cell The dropout rate.
+
+    +row
+        +cell #[code sgd]
+        +cell function
+        +cell An optimizer.
+
+    +footrow
+        +cell return
+        +cell dict
+        +cell Results from the update.
+
+h(2, "begin_training") Language.begin_training
+    +tag contextmanager
+
+p
+    |  Allocate models, pre-process training data and acquire a trainer and
+    |  optimizer. Used as a contextmanager.
+
+aside-code("Example").
+    with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+        for epoch in trainer.epochs(gold):
+            for docs, golds in epoch:
+                state = nlp.update(docs, golds, sgd=optimizer)
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code gold_tuples]
+        +cell iterable
+        +cell Gold-standard training data.
+
+    +row
+        +cell #[code **cfg]
+        +cell -
+        +cell Config parameters.
+
+    +footrow
+        +cell yield
+        +cell tuple
+        +cell A trainer and an optimizer.
+
+h(2, "use_params") Language.use_params
+    +tag contextmanager
+    +tag method
+
+p
+    |  Replace weights of models in the pipeline with those provided in the
+    |  params dictionary. Can be used as a contextmanager, in which case, models
+    |  go back to their original weights after the block.
+
+aside-code("Example").
+    with nlp.use_params(optimizer.averages):
+        nlp.to_disk('/tmp/checkpoint')
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code params]
+        +cell dict
+        +cell A dictionary of parameters keyed by model ID.
+
+    +row
+        +cell #[code **cfg]
+        +cell -
+        +cell Config parameters.
+
+h(2, "pipe") Language.pipe
+    +tag method
+
+p
+    |  Process texts as a stream, and yield #[code Doc] objects in order.
+    |  Supports GIL-free multi-threading.
+
+aside-code("Example").
+    texts = [u'One document.', u'...', u'Lots of documents']
+    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+        assert doc.is_parsed
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code texts]
+        +cell -
+        +cell A sequence of unicode objects.
+
+    +row
+        +cell #[code n_threads]
+        +cell int
+        +cell
+            |  The number of worker threads to use. If #[code -1], OpenMP will
+            |  decide how many to use at run time. Default is #[code 2].
+
+    +row
+        +cell #[code batch_size]
+        +cell int
+        +cell The number of texts to buffer.
+
+    +footrow
+        +cell yield
+        +cell #[code Doc]
+        +cell Documents in the order of the original text.
+
+h(2, "to_disk") Language.to_disk
+    +tag method
+
+p Save the current state to a directory.
+
+aside-code("Example").
+    nlp.to_disk('/path/to/models')
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell unicode or #[code Path]
+        +cell
+            |  A path to a directory, which will be created if it doesn't exist.
+            |  Paths may be either strings or #[code Path]-like objects.
+
+    +row
+        +cell #[code **exclude]
+        +cell -
+        +cell Named attributes to prevent from being saved.
+
+h(2, "from_disk") Language.from_disk
+    +tag method
+
+p Loads state from a directory. Modifies the object in place and returns it.
+
+aside-code("Example").
+    from spacy.language import Language
+    nlp = Language().from_disk('/path/to/models')
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code path]
+        +cell unicode or #[code Path]
+        +cell
+            |  A path to a directory. Paths may be either strings or
+            |  #[code Path]-like objects.
+
+    +row
+        +cell #[code **exclude]
+        +cell -
+        +cell Named attributes to prevent from being loaded.
+
+    +footrow
+        +cell return
+        +cell #[code Language]
+        +cell The modified #[code Language] object.
+
+h(2, "to_bytes") Language.to_bytes
+    +tag method
+
+p Serialize the current state to a binary string.
+
+aside-code("Example").
+    nlp_bytes = nlp.to_bytes()
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code **exclude]
+        +cell -
+        +cell Named attributes to prevent from being serialized.
+
+    +footrow
+        +cell return
+        +cell bytes
+        +cell The serialized form of the #[code Language] object.
+
+h(2, "from_bytes") Language.from_bytes
+    +tag method
+
+p Load state from a binary string.
+
+aside-code("Example").
+    from spacy.lang.en import English
+    nlp_bytes = nlp.to_bytes()
+    nlp2 = English()
+    nlp2.from_bytes(nlp_bytes)
+
+table(["Name", "Type", "Description"])
+    +row
+        +cell #[code bytes_data]
+        +cell bytes
+        +cell The data to load from.
+
+    +row
+        +cell #[code **exclude]
+        +cell -
+        +cell Named attributes to prevent from being loaded.
+
+    +footrow
+        +cell return
+        +cell #[code Language]
+        +cell The #[code Language] object.

 +h(2, "attributes") Attributes

@ -46,109 +344,3 @@ p A text processing pipeline.
        +cell #[code pipeline]
        +cell -
        +cell Sequence of annotation functions.
-
-
-+h(2, "init") Language.__init__
-    +tag method
-
-p Create or load the pipeline.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code **overrides]
-        +cell -
-        +cell Keyword arguments indicating which defaults to override.
-
-    +footrow
-        +cell return
-        +cell #[code Language]
-        +cell The newly constructed object.
-
-+h(2, "call") Language.__call__
-    +tag method
-
-p Apply the pipeline to a single text.
-
-+aside-code("Example").
-    from spacy.en import English
-    nlp = English()
-    doc = nlp('An example sentence. Another example sentence.')
-    doc[0].orth_, doc[0].head.tag_
-    # ('An', 'NN')
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code text]
-        +cell unicode
-        +cell The text to be processed.
-
-    +row
-        +cell #[code tag]
-        +cell bool
-        +cell Whether to apply the part-of-speech tagger.
-
-    +row
-        +cell #[code parse]
-        +cell bool
-        +cell Whether to apply the syntactic dependency parser.
-
-    +row
-        +cell #[code entity]
-        +cell bool
-        +cell Whether to apply the named entity recognizer.
-
-    +footrow
-        +cell return
-        +cell #[code Doc]
-        +cell A container for accessing the linguistic annotations.
-
-+h(2, "pipe") Language.pipe
-    +tag method
-
-p
-    |  Process texts as a stream, and yield #[code Doc] objects in order.
-    |  Supports GIL-free multi-threading.
-
-+aside-code("Example").
-    texts = [u'One document.', u'...', u'Lots of documents']
-    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
-        assert doc.is_parsed
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code texts]
-        +cell -
-        +cell A sequence of unicode objects.
-
-    +row
-        +cell #[code n_threads]
-        +cell int
-        +cell
-            |  The number of worker threads to use. If #[code -1], OpenMP will
-            |  decide how many to use at run time. Default is #[code 2].
-
-    +row
-        +cell #[code batch_size]
-        +cell int
-        +cell The number of texts to buffer.
-
-    +footrow
-        +cell yield
-        +cell #[code Doc]
-        +cell Containers for accessing the linguistic annotations.
-
-+h(2, "save_to_directory") Language.save_to_directory
-    +tag method
-
-p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
-
-+table(["Name", "Type", "Description"])
-    +row
-        +cell #[code path]
-        +cell string or pathlib path
-        +cell Path to save the model.
-
-    +footrow
-        +cell return
-        +cell #[code None]
-        +cell -