mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Update docstrings and API docs for Language class
This commit is contained in:
parent
593361ee3c
commit
d42bc16868
|
@ -115,14 +115,26 @@ class BaseDefaults(object):
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
"""
|
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||||
A text-processing pipeline. Usually you'll load this once per process, and
|
and pass the instance around your application.
|
||||||
pass the instance around your program.
|
|
||||||
"""
|
"""
|
||||||
Defaults = BaseDefaults
|
Defaults = BaseDefaults
|
||||||
lang = None
|
lang = None
|
||||||
|
|
||||||
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
|
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
|
||||||
|
"""Initialise a Language object.
|
||||||
|
|
||||||
|
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||||
|
`Language.Defaults.create_vocab`.
|
||||||
|
make_doc (function): A function that takes text and returns a `Doc`
|
||||||
|
object. Usually a `Tokenizer`.
|
||||||
|
pipeline (list): A list of annotation processes or IDs of annotation
|
||||||
|
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
|
||||||
|
up in `Language.Defaults.factories`.
|
||||||
|
meta (dict): Custom meta data for the Language class. Is written to by
|
||||||
|
models to add model meta data.
|
||||||
|
RETURNS (Language): The newly constructed object.
|
||||||
|
"""
|
||||||
self.meta = dict(meta)
|
self.meta = dict(meta)
|
||||||
|
|
||||||
if vocab is True:
|
if vocab is True:
|
||||||
|
@ -146,23 +158,17 @@ class Language(object):
|
||||||
self.pipeline = []
|
self.pipeline = []
|
||||||
|
|
||||||
def __call__(self, text, state=None, **disabled):
|
def __call__(self, text, state=None, **disabled):
|
||||||
"""
|
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||||
Apply the pipeline to some text. The text can span multiple sentences,
|
and can contain arbitrary whitespace. Alignment into the original string
|
||||||
and can contain arbtrary whitespace. Alignment into the original string
|
|
||||||
is preserved.
|
is preserved.
|
||||||
|
|
||||||
Args:
|
text (unicode): The text to be processed.
|
||||||
text (unicode): The text to be processed.
|
**disabled: Elements of the pipeline that should not be run.
|
||||||
state: Arbitrary
|
RETURNS (Doc): A container for accessing the annotations.
|
||||||
|
|
||||||
Returns:
|
EXAMPLE:
|
||||||
doc (Doc): A container for accessing the annotations.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
>>> from spacy.en import English
|
|
||||||
>>> nlp = English()
|
|
||||||
>>> tokens = nlp('An example sentence. Another example sentence.')
|
>>> tokens = nlp('An example sentence. Another example sentence.')
|
||||||
>>> tokens[0].orth_, tokens[0].head.tag_
|
>>> tokens[0].text, tokens[0].head.tag_
|
||||||
('An', 'NN')
|
('An', 'NN')
|
||||||
"""
|
"""
|
||||||
doc = self.make_doc(text)
|
doc = self.make_doc(text)
|
||||||
|
@ -174,16 +180,28 @@ class Language(object):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||||
|
"""Update the models in the pipeline.
|
||||||
|
|
||||||
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
golds (iterable): A batch of `GoldParse` objects.
|
||||||
|
drop (float): The dropout rate.
|
||||||
|
sgd (function): An optimizer.
|
||||||
|
RETURNS (dict): Results from the update.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||||
|
>>> for epoch in trainer.epochs(gold):
|
||||||
|
>>> for docs, golds in epoch:
|
||||||
|
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||||
|
"""
|
||||||
grads = {}
|
grads = {}
|
||||||
def get_grads(W, dW, key=None):
|
def get_grads(W, dW, key=None):
|
||||||
grads[key] = (W, dW)
|
grads[key] = (W, dW)
|
||||||
state = {} if state is None else state
|
state = {} if state is None else state
|
||||||
for process in self.pipeline:
|
for process in self.pipeline:
|
||||||
if hasattr(process, 'update'):
|
if hasattr(process, 'update'):
|
||||||
state = process.update(docs, golds,
|
state = process.update(docs, golds, state=state, drop=drop,
|
||||||
state=state,
|
sgd=get_grads)
|
||||||
drop=drop,
|
|
||||||
sgd=get_grads)
|
|
||||||
else:
|
else:
|
||||||
process(docs, state=state)
|
process(docs, state=state)
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
|
@ -198,6 +216,19 @@ class Language(object):
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def begin_training(self, gold_tuples, **cfg):
|
def begin_training(self, gold_tuples, **cfg):
|
||||||
|
"""Allocate models, pre-process training data and acquire a trainer and
|
||||||
|
optimizer. Used as a contextmanager.
|
||||||
|
|
||||||
|
gold_tuples (iterable): Gold-standard training data.
|
||||||
|
**cfg: Config parameters.
|
||||||
|
YIELDS (tuple): A trainer and an optimizer.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||||
|
>>> for epoch in trainer.epochs(gold):
|
||||||
|
>>> for docs, golds in epoch:
|
||||||
|
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||||
|
"""
|
||||||
# Populate vocab
|
# Populate vocab
|
||||||
for _, annots_brackets in gold_tuples:
|
for _, annots_brackets in gold_tuples:
|
||||||
for annots, _ in annots_brackets:
|
for annots, _ in annots_brackets:
|
||||||
|
@ -220,6 +251,17 @@ class Language(object):
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def use_params(self, params, **cfg):
|
def use_params(self, params, **cfg):
|
||||||
|
"""Replace weights of models in the pipeline with those provided in the
|
||||||
|
params dictionary. Can be used as a contextmanager, in which case,
|
||||||
|
models go back to their original weights after the block.
|
||||||
|
|
||||||
|
params (dict): A dictionary of parameters keyed by model ID.
|
||||||
|
**cfg: Config parameters.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> with nlp.use_params(optimizer.averages):
|
||||||
|
>>> nlp.to_disk('/tmp/checkpoint')
|
||||||
|
"""
|
||||||
contexts = [pipe.use_params(params) for pipe
|
contexts = [pipe.use_params(params) for pipe
|
||||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||||
# TODO: Having trouble with contextlib
|
# TODO: Having trouble with contextlib
|
||||||
|
@ -237,16 +279,20 @@ class Language(object):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
|
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
|
||||||
"""
|
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
||||||
Process texts as a stream, and yield Doc objects in order.
|
GIL-free multi-threading.
|
||||||
|
|
||||||
Supports GIL-free multi-threading.
|
texts (iterator): A sequence of texts to process.
|
||||||
|
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
||||||
|
decide how many to use at run time. Default is 2.
|
||||||
|
batch_size (int): The number of texts to buffer.
|
||||||
|
**disabled: Pipeline components to exclude.
|
||||||
|
YIELDS (Doc): Documents in the order of the original text.
|
||||||
|
|
||||||
Arguments:
|
EXAMPLE:
|
||||||
texts (iterator)
|
>>> texts = [u'One document.', u'...', u'Lots of documents']
|
||||||
tag (bool)
|
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||||
parse (bool)
|
>>> assert doc.is_parsed
|
||||||
entity (bool)
|
|
||||||
"""
|
"""
|
||||||
#stream = ((self.make_doc(text), None) for text in texts)
|
#stream = ((self.make_doc(text), None) for text in texts)
|
||||||
stream = ((doc, {}) for doc in texts)
|
stream = ((doc, {}) for doc in texts)
|
||||||
|
@ -254,7 +300,6 @@ class Language(object):
|
||||||
name = getattr(proc, 'name', None)
|
name = getattr(proc, 'name', None)
|
||||||
if name in disabled and not disabled[name]:
|
if name in disabled and not disabled[name]:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if hasattr(proc, 'pipe'):
|
if hasattr(proc, 'pipe'):
|
||||||
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
|
stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
|
||||||
else:
|
else:
|
||||||
|
@ -265,11 +310,12 @@ class Language(object):
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
Args:
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
path: A path to a directory, which will be created if it doesn't
|
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||||
exist. Paths may be either strings or pathlib.Path-like
|
**exclude: Named attributes to prevent from being saved.
|
||||||
objects.
|
|
||||||
**exclude: Prevent named attributes from being saved.
|
EXAMPLE:
|
||||||
|
>>> nlp.to_disk('/path/to/models')
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
if not path.exists():
|
if not path.exists():
|
||||||
|
@ -288,12 +334,17 @@ class Language(object):
|
||||||
dill.dump(props, file_)
|
dill.dump(props, file_)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
"""Load the current state from a directory.
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
|
returns it.
|
||||||
|
|
||||||
Args:
|
path (unicode or Path): A path to a directory. Paths may be either
|
||||||
path: A path to a directory. Paths may be either strings or
|
strings or `Path`-like objects.
|
||||||
pathlib.Path-like objects.
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
**exclude: Prevent named attributes from being saved.
|
RETURNS (Language): The modified `Language` object.
|
||||||
|
|
||||||
|
EXAMPLE:
|
||||||
|
>>> from spacy.language import Language
|
||||||
|
>>> nlp = Language().from_disk('/path/to/models')
|
||||||
"""
|
"""
|
||||||
path = util.ensure_path(path)
|
path = util.ensure_path(path)
|
||||||
for name in path.iterdir():
|
for name in path.iterdir():
|
||||||
|
@ -307,10 +358,8 @@ class Language(object):
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
"""Serialize the current state to a binary string.
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
Args:
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
path: A path to a directory. Paths may be either strings or
|
RETURNS (bytes): The serialized form of the `Language` object.
|
||||||
pathlib.Path-like objects.
|
|
||||||
**exclude: Prevent named attributes from being serialized.
|
|
||||||
"""
|
"""
|
||||||
props = dict(self.__dict__)
|
props = dict(self.__dict__)
|
||||||
for key in exclude:
|
for key in exclude:
|
||||||
|
@ -321,9 +370,9 @@ class Language(object):
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
"""Load state from a binary string.
|
"""Load state from a binary string.
|
||||||
|
|
||||||
Args:
|
bytes_data (bytes): The data to load from.
|
||||||
bytes_data (bytes): The data to load from.
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
**exclude: Prevent named attributes from being loaded.
|
RETURNS (Language): The `Language` object.
|
||||||
"""
|
"""
|
||||||
props = dill.loads(bytes_data)
|
props = dill.loads(bytes_data)
|
||||||
for key, value in props.items():
|
for key, value in props.items():
|
||||||
|
|
|
@ -2,7 +2,305 @@
|
||||||
|
|
||||||
include ../../_includes/_mixins
|
include ../../_includes/_mixins
|
||||||
|
|
||||||
p A text processing pipeline.
|
p
|
||||||
|
| A text-processing pipeline. Usually you'll load this once per process,
|
||||||
|
| and pass the instance around your application.
|
||||||
|
|
||||||
|
+h(2, "init") Language.__init__
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Initialise a #[code Language] object.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.language import Language
|
||||||
|
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||||
|
'dependencies'])
|
||||||
|
|
||||||
|
from spacy.lang.en import English
|
||||||
|
nlp = English()
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code vocab]
|
||||||
|
+cell #[code Vocab]
|
||||||
|
+cell
|
||||||
|
| A #[code Vocab] object. If #[code True], a vocab is created via
|
||||||
|
| #[code Language.Defaults.create_vocab].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code make_doc]
|
||||||
|
+cell function
|
||||||
|
+cell
|
||||||
|
| A function that takes text and returns a #[code Doc] object.
|
||||||
|
| Usually a #[code Tokenizer].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code pipeline]
|
||||||
|
+cell list
|
||||||
|
+cell
|
||||||
|
| A list of annotation processes or IDs of annotation processes,
|
||||||
|
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
|
||||||
|
| up in #[code Language.Defaults.factories].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code meta]
|
||||||
|
+cell dict
|
||||||
|
+cell
|
||||||
|
| Custom meta data for the #[code Language] class. Is written to by
|
||||||
|
| models to add model meta data.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell return
|
||||||
|
+cell #[code Language]
|
||||||
|
+cell The newly constructed object.
|
||||||
|
|
||||||
|
+h(2, "call") Language.__call__
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p
|
||||||
|
| Apply the pipeline to some text. The text can span multiple sentences,
|
||||||
|
| and can contain arbitrary whitespace. Alignment into the original string
|
||||||
|
| is preserved.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
tokens = nlp('An example sentence. Another example sentence.')
|
||||||
|
tokens[0].text, tokens[0].head.tag_
|
||||||
|
# ('An', 'NN')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code text]
|
||||||
|
+cell unicode
|
||||||
|
+cell The text to be processed.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **disabled]
|
||||||
|
+cell -
|
||||||
|
+cell Elements of the pipeline that should not be run.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell return
|
||||||
|
+cell #[code Doc]
|
||||||
|
+cell A container for accessing the annotations.
|
||||||
|
|
||||||
|
+h(2, "update") Language.update
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Update the models in the pipeline.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||||
|
for epoch in trainer.epochs(gold):
|
||||||
|
for docs, golds in epoch:
|
||||||
|
state = nlp.update(docs, golds, sgd=optimizer)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code docs]
|
||||||
|
+cell iterable
|
||||||
|
+cell A batch of #[code Doc] objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code golds]
|
||||||
|
+cell iterable
|
||||||
|
+cell A batch of #[code GoldParse] objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code drop]
|
||||||
|
+cell float
|
||||||
|
+cell The dropout rate.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code sgd]
|
||||||
|
+cell function
|
||||||
|
+cell An optimizer.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell return
|
||||||
|
+cell dict
|
||||||
|
+cell Results from the update.
|
||||||
|
|
||||||
|
+h(2, "begin_training") Language.begin_training
|
||||||
|
+tag contextmanager
|
||||||
|
|
||||||
|
p
|
||||||
|
| Allocate models, pre-process training data and acquire a trainer and
|
||||||
|
| optimizer. Used as a contextmanager.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||||
|
for epoch in trainer.epochs(gold):
|
||||||
|
for docs, golds in epoch:
|
||||||
|
state = nlp.update(docs, golds, sgd=optimizer)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code gold_tuples]
|
||||||
|
+cell iterable
|
||||||
|
+cell Gold-standard training data.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **cfg]
|
||||||
|
+cell -
|
||||||
|
+cell Config parameters.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell yield
|
||||||
|
+cell tuple
|
||||||
|
+cell A trainer and an optimizer.
|
||||||
|
|
||||||
|
+h(2, "use_params") Language.use_params
|
||||||
|
+tag contextmanager
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p
|
||||||
|
| Replace weights of models in the pipeline with those provided in the
|
||||||
|
| params dictionary. Can be used as a contextmanager, in which case, models
|
||||||
|
| go back to their original weights after the block.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
with nlp.use_params(optimizer.averages):
|
||||||
|
nlp.to_disk('/tmp/checkpoint')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code params]
|
||||||
|
+cell dict
|
||||||
|
+cell A dictionary of parameters keyed by model ID.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **cfg]
|
||||||
|
+cell -
|
||||||
|
+cell Config parameters.
|
||||||
|
|
||||||
|
+h(2, "pipe") Language.pipe
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p
|
||||||
|
| Process texts as a stream, and yield #[code Doc] objects in order.
|
||||||
|
| Supports GIL-free multi-threading.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
texts = [u'One document.', u'...', u'Lots of documents']
|
||||||
|
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||||
|
assert doc.is_parsed
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code texts]
|
||||||
|
+cell -
|
||||||
|
+cell A sequence of unicode objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code n_threads]
|
||||||
|
+cell int
|
||||||
|
+cell
|
||||||
|
| The number of worker threads to use. If #[code -1], OpenMP will
|
||||||
|
| decide how many to use at run time. Default is #[code 2].
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code batch_size]
|
||||||
|
+cell int
|
||||||
|
+cell The number of texts to buffer.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell yield
|
||||||
|
+cell #[code Doc]
|
||||||
|
+cell Documents in the order of the original text.
|
||||||
|
|
||||||
|
+h(2, "to_disk") Language.to_disk
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Save the current state to a directory.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp.to_disk('/path/to/models')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell
|
||||||
|
| A path to a directory, which will be created if it doesn't exist.
|
||||||
|
| Paths may be either strings or #[code Path]-like objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being saved.
|
||||||
|
|
||||||
|
+h(2, "from_disk") Language.from_disk
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Loads state from a directory. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.language import Language
|
||||||
|
nlp = Language().from_disk('/path/to/models')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell
|
||||||
|
| A path to a directory. Paths may be either strings or
|
||||||
|
| #[code Path]-like objects.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being loaded.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell return
|
||||||
|
+cell #[code Language]
|
||||||
|
+cell The modified #[code Language] object.
|
||||||
|
|
||||||
|
+h(2, "to_bytes") Language.to_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
nlp_bytes = nlp.to_bytes()
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being serialized.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell return
|
||||||
|
+cell bytes
|
||||||
|
+cell The serialized form of the #[code Language] object.
|
||||||
|
|
||||||
|
+h(2, "from_bytes") Language.from_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Load state from a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.lang.en import English
|
||||||
|
nlp_bytes = nlp.to_bytes()
|
||||||
|
nlp2 = English()
|
||||||
|
nlp2.from_bytes(nlp_bytes)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code bytes_data]
|
||||||
|
+cell bytes
|
||||||
|
+cell The data to load from.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being loaded.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell return
|
||||||
|
+cell #[code Language]
|
||||||
|
+cell The #[code Language] object.
|
||||||
|
|
||||||
+h(2, "attributes") Attributes
|
+h(2, "attributes") Attributes
|
||||||
|
|
||||||
|
@ -46,109 +344,3 @@ p A text processing pipeline.
|
||||||
+cell #[code pipeline]
|
+cell #[code pipeline]
|
||||||
+cell -
|
+cell -
|
||||||
+cell Sequence of annotation functions.
|
+cell Sequence of annotation functions.
|
||||||
|
|
||||||
|
|
||||||
+h(2, "init") Language.__init__
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Create or load the pipeline.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code **overrides]
|
|
||||||
+cell -
|
|
||||||
+cell Keyword arguments indicating which defaults to override.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell return
|
|
||||||
+cell #[code Language]
|
|
||||||
+cell The newly constructed object.
|
|
||||||
|
|
||||||
+h(2, "call") Language.__call__
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Apply the pipeline to a single text.
|
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
from spacy.en import English
|
|
||||||
nlp = English()
|
|
||||||
doc = nlp('An example sentence. Another example sentence.')
|
|
||||||
doc[0].orth_, doc[0].head.tag_
|
|
||||||
# ('An', 'NN')
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code text]
|
|
||||||
+cell unicode
|
|
||||||
+cell The text to be processed.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code tag]
|
|
||||||
+cell bool
|
|
||||||
+cell Whether to apply the part-of-speech tagger.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code parse]
|
|
||||||
+cell bool
|
|
||||||
+cell Whether to apply the syntactic dependency parser.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code entity]
|
|
||||||
+cell bool
|
|
||||||
+cell Whether to apply the named entity recognizer.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell return
|
|
||||||
+cell #[code Doc]
|
|
||||||
+cell A container for accessing the linguistic annotations.
|
|
||||||
|
|
||||||
+h(2, "pipe") Language.pipe
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p
|
|
||||||
| Process texts as a stream, and yield #[code Doc] objects in order.
|
|
||||||
| Supports GIL-free multi-threading.
|
|
||||||
|
|
||||||
+aside-code("Example").
|
|
||||||
texts = [u'One document.', u'...', u'Lots of documents']
|
|
||||||
for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
|
||||||
assert doc.is_parsed
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code texts]
|
|
||||||
+cell -
|
|
||||||
+cell A sequence of unicode objects.
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code n_threads]
|
|
||||||
+cell int
|
|
||||||
+cell
|
|
||||||
| The number of worker threads to use. If #[code -1], OpenMP will
|
|
||||||
| decide how many to use at run time. Default is #[code 2].
|
|
||||||
|
|
||||||
+row
|
|
||||||
+cell #[code batch_size]
|
|
||||||
+cell int
|
|
||||||
+cell The number of texts to buffer.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell yield
|
|
||||||
+cell #[code Doc]
|
|
||||||
+cell Containers for accessing the linguistic annotations.
|
|
||||||
|
|
||||||
+h(2, "save_to_directory") Language.save_to_directory
|
|
||||||
+tag method
|
|
||||||
|
|
||||||
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
|
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
|
||||||
+row
|
|
||||||
+cell #[code path]
|
|
||||||
+cell string or pathlib path
|
|
||||||
+cell Path to save the model.
|
|
||||||
|
|
||||||
+footrow
|
|
||||||
+cell return
|
|
||||||
+cell #[code None]
|
|
||||||
+cell -
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user