Update docstrings and API docs for Language class

2025-08-24 14:04:56 +03:00 · 2017-05-18 23:57:38 +02:00 · 2017-05-18 23:57:38 +02:00 · d42bc16868
commit d42bc16868
parent 593361ee3c
2 changed files with 394 additions and 153 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -115,14 +115,26 @@ class BaseDefaults(object):
 class Language(object):
-    """
+    """A text-processing pipeline. Usually you'll load this once per process,
-    A text-processing pipeline. Usually you'll load this once per process, and
+    and pass the instance around your application.
    pass the instance around your program.
    """
    Defaults = BaseDefaults
    lang = None
    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
        """Initialise a Language object.
        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
            `Language.Defaults.create_vocab`.
        make_doc (function): A function that takes text and returns a `Doc`
            object. Usually a `Tokenizer`.
        pipeline (list): A list of annotation processes or IDs of annotation
            processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
            up in `Language.Defaults.factories`.
        meta (dict): Custom meta data for the Language class. Is written to by
            models to add model meta data.
        RETURNS (Language): The newly constructed object.
        """
        self.meta = dict(meta)
        if vocab is True:
@ -146,23 +158,17 @@ class Language(object):
            self.pipeline = []
    def __call__(self, text, state=None, **disabled):
-        """
+        """Apply the pipeline to some text. The text can span multiple sentences,
-        Apply the pipeline to some text.  The text can span multiple sentences,
+        and can contain arbitrary whitespace. Alignment into the original string
        and can contain arbtrary whitespace.  Alignment into the original string
        is preserved.
-        Args:
+        text (unicode): The text to be processed.
-            text (unicode): The text to be processed.
+        **disabled: Elements of the pipeline that should not be run.
-            state: Arbitrary
+        RETURNS (Doc): A container for accessing the annotations.
-        Returns:
+        EXAMPLE:
            doc (Doc): A container for accessing the annotations.
        Example:
            >>> from spacy.en import English
            >>> nlp = English()
            >>> tokens = nlp('An example sentence. Another example sentence.')
-            >>> tokens[0].orth_, tokens[0].head.tag_
+            >>> tokens[0].text, tokens[0].head.tag_
            ('An', 'NN')
        """
        doc = self.make_doc(text)
@ -174,16 +180,28 @@ class Language(object):
        return doc
    def update(self, docs, golds, state=None, drop=0., sgd=None):
        """Update the models in the pipeline.
        docs (iterable): A batch of `Doc` objects.
        golds (iterable): A batch of `GoldParse` objects.
        drop (float): The dropout rate.
        sgd (function): An optimizer.
        RETURNS (dict): Results from the update.
        EXAMPLE:
            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
            >>>    for epoch in trainer.epochs(gold):
            >>>        for docs, golds in epoch:
            >>>            state = nlp.update(docs, golds, sgd=optimizer)
        """
        grads = {}
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
        state = {} if state is None else state
        for process in self.pipeline:
            if hasattr(process, 'update'):
-                state = process.update(docs, golds,
+                state = process.update(docs, golds, state=state, drop=drop,
-                            state=state,
+                                                    sgd=get_grads)
                            drop=drop,
                            sgd=get_grads)
            else:
                process(docs, state=state)
        if sgd is not None:
@ -198,6 +216,19 @@ class Language(object):
    @contextmanager
    def begin_training(self, gold_tuples, **cfg):
        """Allocate models, pre-process training data and acquire a trainer and
        optimizer. Used as a contextmanager.
        gold_tuples (iterable): Gold-standard training data.
        **cfg: Config parameters.
        YIELDS (tuple): A trainer and an optimizer.
        EXAMPLE:
            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
            >>>    for epoch in trainer.epochs(gold):
            >>>        for docs, golds in epoch:
            >>>            state = nlp.update(docs, golds, sgd=optimizer)
        """
        # Populate vocab
        for _, annots_brackets in gold_tuples:
            for annots, _ in annots_brackets:
@ -220,6 +251,17 @@ class Language(object):
    @contextmanager
    def use_params(self, params, **cfg):
        """Replace weights of models in the pipeline with those provided in the
        params dictionary. Can be used as a contextmanager, in which case,
        models go back to their original weights after the block.
        params (dict): A dictionary of parameters keyed by model ID.
        **cfg: Config parameters.
        EXAMPLE:
            >>> with nlp.use_params(optimizer.averages):
            >>>     nlp.to_disk('/tmp/checkpoint')
        """
        contexts = [pipe.use_params(params) for pipe
                    in self.pipeline if hasattr(pipe, 'use_params')]
        # TODO: Having trouble with contextlib
@ -237,16 +279,20 @@ class Language(object):
                pass
    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
-        """
+        """Process texts as a stream, and yield `Doc` objects in order. Supports
-        Process texts as a stream, and yield Doc objects in order.
+        GIL-free multi-threading.
-        Supports GIL-free multi-threading.
+        texts (iterator): A sequence of texts to process.
        n_threads (int): The number of worker threads to use. If -1, OpenMP will
            decide how many to use at run time. Default is 2.
        batch_size (int): The number of texts to buffer.
        **disabled: Pipeline components to exclude.
        YIELDS (Doc): Documents in the order of the original text.
-        Arguments:
+        EXAMPLE:
-            texts (iterator)
+            >>> texts = [u'One document.', u'...', u'Lots of documents']
-            tag (bool)
+            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
-            parse (bool)
+            >>>     assert doc.is_parsed
            entity (bool)
        """
        #stream = ((self.make_doc(text), None) for text in texts)
        stream = ((doc, {}) for doc in texts)
@ -254,7 +300,6 @@ class Language(object):
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
            if hasattr(proc, 'pipe'):
                stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
            else:
@ -265,11 +310,12 @@ class Language(object):
    def to_disk(self, path, **exclude):
        """Save the current state to a directory.
-        Args:
+        path (unicode or Path): A path to a directory, which will be created if
-            path: A path to a directory, which will be created if it doesn't
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
-                    exist. Paths may be either strings or pathlib.Path-like
+        **exclude: Named attributes to prevent from being saved.
-                    objects.
+
-            **exclude: Prevent named attributes from being saved.
+        EXAMPLE:
            >>> nlp.to_disk('/path/to/models')
        """
        path = util.ensure_path(path)
        if not path.exists():
@ -288,12 +334,17 @@ class Language(object):
            dill.dump(props, file_)
    def from_disk(self, path, **exclude):
-        """Load the current state from a directory.
+        """Loads state from a directory. Modifies the object in place and
        returns it.
-        Args:
+        path (unicode or Path): A path to a directory. Paths may be either
-            path: A path to a directory. Paths may be either strings or
+            strings or `Path`-like objects.
-                pathlib.Path-like objects.
+        **exclude: Named attributes to prevent from being loaded.
-            **exclude: Prevent named attributes from being saved.
+        RETURNS (Language): The modified `Language` object.
        EXAMPLE:
            >>> from spacy.language import Language
            >>> nlp = Language().from_disk('/path/to/models')
        """
        path = util.ensure_path(path)
        for name in path.iterdir():
@ -307,10 +358,8 @@ class Language(object):
    def to_bytes(self, **exclude):
        """Serialize the current state to a binary string.
-        Args:
+        **exclude: Named attributes to prevent from being serialized.
-            path: A path to a directory. Paths may be either strings or
+        RETURNS (bytes): The serialized form of the `Language` object.
                pathlib.Path-like objects.
            **exclude: Prevent named attributes from being serialized.
        """
        props = dict(self.__dict__)
        for key in exclude:
@ -321,9 +370,9 @@ class Language(object):
    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.
-        Args:
+        bytes_data (bytes): The data to load from.
-            bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
-            **exclude: Prevent named attributes from being loaded.
+        RETURNS (Language): The `Language` object.
        """
        props = dill.loads(bytes_data)
        for key, value in props.items():
--- a/website/docs/api/language.jade
+++ b/website/docs/api/language.jade
@ -2,7 +2,305 @@
 include ../../_includes/_mixins
-p A text processing pipeline.
+p
    |  A text-processing pipeline. Usually you'll load this once per process,
    |  and pass the instance around your application.
 +h(2, "init") Language.__init__
    +tag method
 p Initialise a #[code Language] object.
 +aside-code("Example").
    from spacy.language import Language
    nlp = Language(pipeline=['token_vectors', 'tags',
                             'dependencies'])
    from spacy.lang.en import English
    nlp = English()
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell
            |  A #[code Vocab] object. If #[code True], a vocab is created via
            |  #[code Language.Defaults.create_vocab].
    +row
        +cell #[code make_doc]
        +cell function
        +cell
            |  A function that takes text and returns a #[code Doc] object.
            |  Usually a #[code Tokenizer].
    +row
        +cell #[code pipeline]
        +cell list
        +cell
            |  A list of annotation processes or IDs of annotation processes,
            |  e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
            |  up in #[code Language.Defaults.factories].
    +row
        +cell #[code meta]
        +cell dict
        +cell
            |  Custom meta data for the #[code Language] class. Is written to by
            |  models to add model meta data.
    +footrow
        +cell return
        +cell #[code Language]
        +cell The newly constructed object.
 +h(2, "call") Language.__call__
    +tag method
 p
    |  Apply the pipeline to some text. The text can span multiple sentences,
    |  and can contain arbitrary whitespace. Alignment into the original string
    |  is preserved.
 +aside-code("Example").
    tokens = nlp('An example sentence. Another example sentence.')
    tokens[0].text, tokens[0].head.tag_
    # ('An', 'NN')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code text]
        +cell unicode
        +cell The text to be processed.
    +row
        +cell #[code **disabled]
        +cell -
        +cell Elements of the pipeline that should not be run.
    +footrow
        +cell return
        +cell #[code Doc]
        +cell A container for accessing the annotations.
 +h(2, "update") Language.update
    +tag method
 p Update the models in the pipeline.
 +aside-code("Example").
    with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
        for epoch in trainer.epochs(gold):
            for docs, golds in epoch:
                state = nlp.update(docs, golds, sgd=optimizer)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code docs]
        +cell iterable
        +cell A batch of #[code Doc] objects.
    +row
        +cell #[code golds]
        +cell iterable
        +cell A batch of #[code GoldParse] objects.
    +row
        +cell #[code drop]
        +cell float
        +cell The dropout rate.
    +row
        +cell #[code sgd]
        +cell function
        +cell An optimizer.
    +footrow
        +cell return
        +cell dict
        +cell Results from the update.
 +h(2, "begin_training") Language.begin_training
    +tag contextmanager
 p
    |  Allocate models, pre-process training data and acquire a trainer and
    |  optimizer. Used as a contextmanager.
 +aside-code("Example").
    with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
        for epoch in trainer.epochs(gold):
            for docs, golds in epoch:
                state = nlp.update(docs, golds, sgd=optimizer)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code gold_tuples]
        +cell iterable
        +cell Gold-standard training data.
    +row
        +cell #[code **cfg]
        +cell -
        +cell Config parameters.
    +footrow
        +cell yield
        +cell tuple
        +cell A trainer and an optimizer.
 +h(2, "use_params") Language.use_params
    +tag contextmanager
    +tag method
 p
    |  Replace weights of models in the pipeline with those provided in the
    |  params dictionary. Can be used as a contextmanager, in which case, models
    |  go back to their original weights after the block.
 +aside-code("Example").
    with nlp.use_params(optimizer.averages):
        nlp.to_disk('/tmp/checkpoint')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code params]
        +cell dict
        +cell A dictionary of parameters keyed by model ID.
    +row
        +cell #[code **cfg]
        +cell -
        +cell Config parameters.
 +h(2, "pipe") Language.pipe
    +tag method
 p
    |  Process texts as a stream, and yield #[code Doc] objects in order.
    |  Supports GIL-free multi-threading.
 +aside-code("Example").
    texts = [u'One document.', u'...', u'Lots of documents']
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        assert doc.is_parsed
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code texts]
        +cell -
        +cell A sequence of unicode objects.
    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of worker threads to use. If #[code -1], OpenMP will
            |  decide how many to use at run time. Default is #[code 2].
    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of texts to buffer.
    +footrow
        +cell yield
        +cell #[code Doc]
        +cell Documents in the order of the original text.
 +h(2, "to_disk") Language.to_disk
    +tag method
 p Save the current state to a directory.
 +aside-code("Example").
    nlp.to_disk('/path/to/models')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory, which will be created if it doesn't exist.
            |  Paths may be either strings or #[code Path]-like objects.
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being saved.
 +h(2, "from_disk") Language.from_disk
    +tag method
 p Loads state from a directory. Modifies the object in place and returns it.
 +aside-code("Example").
    from spacy.language import Language
    nlp = Language().from_disk('/path/to/models')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory. Paths may be either strings or
            |  #[code Path]-like objects.
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being loaded.
    +footrow
        +cell return
        +cell #[code Language]
        +cell The modified #[code Language] object.
 +h(2, "to_bytes") Language.to_bytes
    +tag method
 p Serialize the current state to a binary string.
 +aside-code("Example").
    nlp_bytes = nlp.to_bytes()
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being serialized.
    +footrow
        +cell return
        +cell bytes
        +cell The serialized form of the #[code Language] object.
 +h(2, "from_bytes") Language.from_bytes
    +tag method
 p Load state from a binary string.
 +aside-code("Example").
    from spacy.lang.en import English
    nlp_bytes = nlp.to_bytes()
    nlp2 = English()
    nlp2.from_bytes(nlp_bytes)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code bytes_data]
        +cell bytes
        +cell The data to load from.
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being loaded.
    +footrow
        +cell return
        +cell #[code Language]
        +cell The #[code Language] object.
 +h(2, "attributes") Attributes
@ -46,109 +344,3 @@ p A text processing pipeline.
        +cell #[code pipeline]
        +cell -
        +cell Sequence of annotation functions.
 +h(2, "init") Language.__init__
    +tag method
 p Create or load the pipeline.
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code **overrides]
        +cell -
        +cell Keyword arguments indicating which defaults to override.
    +footrow
        +cell return
        +cell #[code Language]
        +cell The newly constructed object.
 +h(2, "call") Language.__call__
    +tag method
 p Apply the pipeline to a single text.
 +aside-code("Example").
    from spacy.en import English
    nlp = English()
    doc = nlp('An example sentence. Another example sentence.')
    doc[0].orth_, doc[0].head.tag_
    # ('An', 'NN')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code text]
        +cell unicode
        +cell The text to be processed.
    +row
        +cell #[code tag]
        +cell bool
        +cell Whether to apply the part-of-speech tagger.
    +row
        +cell #[code parse]
        +cell bool
        +cell Whether to apply the syntactic dependency parser.
    +row
        +cell #[code entity]
        +cell bool
        +cell Whether to apply the named entity recognizer.
    +footrow
        +cell return
        +cell #[code Doc]
        +cell A container for accessing the linguistic annotations.
 +h(2, "pipe") Language.pipe
    +tag method
 p
    |  Process texts as a stream, and yield #[code Doc] objects in order.
    |  Supports GIL-free multi-threading.
 +aside-code("Example").
    texts = [u'One document.', u'...', u'Lots of documents']
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        assert doc.is_parsed
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code texts]
        +cell -
        +cell A sequence of unicode objects.
    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of worker threads to use. If #[code -1], OpenMP will
            |  decide how many to use at run time. Default is #[code 2].
    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of texts to buffer.
    +footrow
        +cell yield
        +cell #[code Doc]
        +cell Containers for accessing the linguistic annotations.
 +h(2, "save_to_directory") Language.save_to_directory
    +tag method
 p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell string or pathlib path
        +cell Path to save the model.
    +footrow
        +cell return
        +cell #[code None]
        +cell -