Update docstrings and API docs for Language class

This commit is contained in:
ines 2017-05-18 23:57:38 +02:00
parent 593361ee3c
commit d42bc16868
2 changed files with 394 additions and 153 deletions

View File

@ -115,14 +115,26 @@ class BaseDefaults(object):
class Language(object): class Language(object):
""" """A text-processing pipeline. Usually you'll load this once per process,
A text-processing pipeline. Usually you'll load this once per process, and and pass the instance around your application.
pass the instance around your program.
""" """
Defaults = BaseDefaults Defaults = BaseDefaults
lang = None lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}): def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
`Language.Defaults.create_vocab`.
make_doc (function): A function that takes text and returns a `Doc`
object. Usually a `Tokenizer`.
pipeline (list): A list of annotation processes or IDs of annotation
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
up in `Language.Defaults.factories`.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self.meta = dict(meta) self.meta = dict(meta)
if vocab is True: if vocab is True:
@ -146,23 +158,17 @@ class Language(object):
self.pipeline = [] self.pipeline = []
def __call__(self, text, state=None, **disabled): def __call__(self, text, state=None, **disabled):
""" """Apply the pipeline to some text. The text can span multiple sentences,
Apply the pipeline to some text. The text can span multiple sentences, and can contain arbitrary whitespace. Alignment into the original string
and can contain arbitrary whitespace. Alignment into the original string
is preserved. is preserved.
Args: text (unicode): The text to be processed.
text (unicode): The text to be processed. **disabled: Elements of the pipeline that should not be run.
state: Arbitrary RETURNS (Doc): A container for accessing the annotations.
Returns: EXAMPLE:
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.') >>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_ >>> tokens[0].text, tokens[0].head.tag_
('An', 'NN') ('An', 'NN')
""" """
doc = self.make_doc(text) doc = self.make_doc(text)
@ -174,16 +180,28 @@ class Language(object):
return doc return doc
def update(self, docs, golds, state=None, drop=0., sgd=None): def update(self, docs, golds, state=None, drop=0., sgd=None):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (function): An optimizer.
RETURNS (dict): Results from the update.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
grads = {} grads = {}
def get_grads(W, dW, key=None): def get_grads(W, dW, key=None):
grads[key] = (W, dW) grads[key] = (W, dW)
state = {} if state is None else state state = {} if state is None else state
for process in self.pipeline: for process in self.pipeline:
if hasattr(process, 'update'): if hasattr(process, 'update'):
state = process.update(docs, golds, state = process.update(docs, golds, state=state, drop=drop,
state=state, sgd=get_grads)
drop=drop,
sgd=get_grads)
else: else:
process(docs, state=state) process(docs, state=state)
if sgd is not None: if sgd is not None:
@ -198,6 +216,19 @@ class Language(object):
@contextmanager @contextmanager
def begin_training(self, gold_tuples, **cfg): def begin_training(self, gold_tuples, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data.
**cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
# Populate vocab # Populate vocab
for _, annots_brackets in gold_tuples: for _, annots_brackets in gold_tuples:
for annots, _ in annots_brackets: for annots, _ in annots_brackets:
@ -220,6 +251,17 @@ class Language(object):
@contextmanager @contextmanager
def use_params(self, params, **cfg): def use_params(self, params, **cfg):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
**cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe contexts = [pipe.use_params(params) for pipe
in self.pipeline if hasattr(pipe, 'use_params')] in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib # TODO: Having trouble with contextlib
@ -237,16 +279,20 @@ class Language(object):
pass pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled): def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
""" """Process texts as a stream, and yield `Doc` objects in order. Supports
Process texts as a stream, and yield Doc objects in order. GIL-free multi-threading.
Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude.
YIELDS (Doc): Documents in the order of the original text.
Arguments: EXAMPLE:
texts (iterator) >>> texts = [u'One document.', u'...', u'Lots of documents']
tag (bool) >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
parse (bool) >>> assert doc.is_parsed
entity (bool)
""" """
#stream = ((self.make_doc(text), None) for text in texts) #stream = ((self.make_doc(text), None) for text in texts)
stream = ((doc, {}) for doc in texts) stream = ((doc, {}) for doc in texts)
@ -254,7 +300,6 @@ class Language(object):
name = getattr(proc, 'name', None) name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]: if name in disabled and not disabled[name]:
continue continue
if hasattr(proc, 'pipe'): if hasattr(proc, 'pipe'):
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size) stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
else: else:
@ -265,11 +310,12 @@ class Language(object):
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Save the current state to a directory. """Save the current state to a directory.
Args: path (unicode or Path): A path to a directory, which will be created if
path: A path to a directory, which will be created if it doesn't it doesn't exist. Paths may be either strings or `Path`-like objects.
exist. Paths may be either strings or pathlib.Path-like **exclude: Named attributes to prevent from being saved.
objects.
**exclude: Prevent named attributes from being saved. EXAMPLE:
>>> nlp.to_disk('/path/to/models')
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
@ -288,12 +334,17 @@ class Language(object):
dill.dump(props, file_) dill.dump(props, file_)
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
"""Load the current state from a directory. """Loads state from a directory. Modifies the object in place and
returns it.
Args: path (unicode or Path): A path to a directory. Paths may be either
path: A path to a directory. Paths may be either strings or strings or `Path`-like objects.
pathlib.Path-like objects. **exclude: Named attributes to prevent from being loaded.
**exclude: Prevent named attributes from being saved. RETURNS (Language): The modified `Language` object.
EXAMPLE:
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
for name in path.iterdir(): for name in path.iterdir():
@ -307,10 +358,8 @@ class Language(object):
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
Args: **exclude: Named attributes to prevent from being serialized.
path: A path to a directory. Paths may be either strings or RETURNS (bytes): The serialized form of the `Language` object.
pathlib.Path-like objects.
**exclude: Prevent named attributes from being serialized.
""" """
props = dict(self.__dict__) props = dict(self.__dict__)
for key in exclude: for key in exclude:
@ -321,9 +370,9 @@ class Language(object):
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string. """Load state from a binary string.
Args: bytes_data (bytes): The data to load from.
bytes_data (bytes): The data to load from. **exclude: Named attributes to prevent from being loaded.
**exclude: Prevent named attributes from being loaded. RETURNS (Language): The `Language` object.
""" """
props = dill.loads(bytes_data) props = dill.loads(bytes_data)
for key, value in props.items(): for key, value in props.items():

View File

@ -2,7 +2,305 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p A text processing pipeline. p
| A text-processing pipeline. Usually you'll load this once per process,
| and pass the instance around your application.
+h(2, "init") Language.__init__
+tag method
p Initialise a #[code Language] object.
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
from spacy.lang.en import English
nlp = English()
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell
| A #[code Vocab] object. If #[code True], a vocab is created via
| #[code Language.Defaults.create_vocab].
+row
+cell #[code make_doc]
+cell function
+cell
| A function that takes text and returns a #[code Doc] object.
| Usually a #[code Tokenizer].
+row
+cell #[code pipeline]
+cell list
+cell
| A list of annotation processes or IDs of annotation processes,
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
| up in #[code Language.Defaults.factories].
+row
+cell #[code meta]
+cell dict
+cell
| Custom meta data for the #[code Language] class. Is written to by
| models to add model meta data.
+footrow
+cell return
+cell #[code Language]
+cell The newly constructed object.
+h(2, "call") Language.__call__
+tag method
p
| Apply the pipeline to some text. The text can span multiple sentences,
| and can contain arbitrary whitespace. Alignment into the original string
| is preserved.
+aside-code("Example").
tokens = nlp('An example sentence. Another example sentence.')
tokens[0].text, tokens[0].head.tag_
# ('An', 'NN')
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell The text to be processed.
+row
+cell #[code **disabled]
+cell -
+cell Elements of the pipeline that should not be run.
+footrow
+cell return
+cell #[code Doc]
+cell A container for accessing the annotations.
+h(2, "update") Language.update
+tag method
p Update the models in the pipeline.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code docs]
+cell iterable
+cell A batch of #[code Doc] objects.
+row
+cell #[code golds]
+cell iterable
+cell A batch of #[code GoldParse] objects.
+row
+cell #[code drop]
+cell float
+cell The dropout rate.
+row
+cell #[code sgd]
+cell function
+cell An optimizer.
+footrow
+cell return
+cell dict
+cell Results from the update.
+h(2, "begin_training") Language.begin_training
+tag contextmanager
p
| Allocate models, pre-process training data and acquire a trainer and
| optimizer. Used as a contextmanager.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code gold_tuples]
+cell iterable
+cell Gold-standard training data.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+footrow
+cell yield
+cell tuple
+cell A trainer and an optimizer.
+h(2, "use_params") Language.use_params
+tag contextmanager
+tag method
p
| Replace weights of models in the pipeline with those provided in the
| params dictionary. Can be used as a contextmanager, in which case, models
| go back to their original weights after the block.
+aside-code("Example").
with nlp.use_params(optimizer.averages):
nlp.to_disk('/tmp/checkpoint')
+table(["Name", "Type", "Description"])
+row
+cell #[code params]
+cell dict
+cell A dictionary of parameters keyed by model ID.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+h(2, "pipe") Language.pipe
+tag method
p
| Process texts as a stream, and yield #[code Doc] objects in order.
| Supports GIL-free multi-threading.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
assert doc.is_parsed
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of worker threads to use. If #[code -1], OpenMP will
| decide how many to use at run time. Default is #[code 2].
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to buffer.
+footrow
+cell yield
+cell #[code Doc]
+cell Documents in the order of the original text.
+h(2, "to_disk") Language.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
nlp.to_disk('/path/to/models')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.language import Language
nlp = Language().from_disk('/path/to/models')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell return
+cell #[code Language]
+cell The modified #[code Language] object.
+h(2, "to_bytes") Language.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
nlp_bytes = nlp.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell return
+cell bytes
+cell The serialized form of the #[code Language] object.
+h(2, "from_bytes") Language.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.lang.en import English
nlp_bytes = nlp.to_bytes()
nlp2 = English()
nlp2.from_bytes(nlp_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell return
+cell #[code Language]
+cell The #[code Language] object.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
@ -46,109 +344,3 @@ p A text processing pipeline.
+cell #[code pipeline] +cell #[code pipeline]
+cell - +cell -
+cell Sequence of annotation functions. +cell Sequence of annotation functions.
+h(2, "init") Language.__init__
+tag method
p Create or load the pipeline.
+table(["Name", "Type", "Description"])
+row
+cell #[code **overrides]
+cell -
+cell Keyword arguments indicating which defaults to override.
+footrow
+cell return
+cell #[code Language]
+cell The newly constructed object.
+h(2, "call") Language.__call__
+tag method
p Apply the pipeline to a single text.
+aside-code("Example").
from spacy.en import English
nlp = English()
doc = nlp('An example sentence. Another example sentence.')
doc[0].orth_, doc[0].head.tag_
# ('An', 'NN')
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell The text to be processed.
+row
+cell #[code tag]
+cell bool
+cell Whether to apply the part-of-speech tagger.
+row
+cell #[code parse]
+cell bool
+cell Whether to apply the syntactic dependency parser.
+row
+cell #[code entity]
+cell bool
+cell Whether to apply the named entity recognizer.
+footrow
+cell return
+cell #[code Doc]
+cell A container for accessing the linguistic annotations.
+h(2, "pipe") Language.pipe
+tag method
p
| Process texts as a stream, and yield #[code Doc] objects in order.
| Supports GIL-free multi-threading.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
assert doc.is_parsed
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode objects.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of worker threads to use. If #[code -1], OpenMP will
| decide how many to use at run time. Default is #[code 2].
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to buffer.
+footrow
+cell yield
+cell #[code Doc]
+cell Containers for accessing the linguistic annotations.
+h(2, "save_to_directory") Language.save_to_directory
+tag method
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell string or pathlib path
+cell Path to save the model.
+footrow
+cell return
+cell #[code None]
+cell -