mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-30 20:06:30 +03:00
391 lines
10 KiB
Plaintext
391 lines
10 KiB
Plaintext
|
//- 💫 DOCS > API > PIPE
|
||
|
|
||
|
include ../_includes/_mixins
|
||
|
|
||
|
//- This page can be used as a template for all other classes that inherit
|
||
|
//- from `Pipe`.
|
||
|
|
||
|
if subclass
|
||
|
+infobox
|
||
|
| This class is a subclass of #[+api("pipe") #[code Pipe]] and
|
||
|
| follows the same API. The pipeline component is available in the
|
||
|
| #[+a("/usage/processing-pipelines") processing pipeline] via the ID
|
||
|
| #[code "#{pipeline_id}"].
|
||
|
|
||
|
else
|
||
|
p
|
||
|
| This class is not instantiated directly. Components inherit from it,
|
||
|
| and it defines the interface that components should follow to
|
||
|
| function as components in a spaCy analysis pipeline.
|
||
|
|
||
|
- CLASSNAME = subclass || 'Pipe'
|
||
|
- VARNAME = short || CLASSNAME.toLowerCase()
|
||
|
|
||
|
|
||
|
+h(2, "model") #{CLASSNAME}.Model
|
||
|
+tag classmethod
|
||
|
|
||
|
p
|
||
|
| Initialise a model for the pipe. The model should implement the
|
||
|
| #[code thinc.neural.Model] API. Wrappers are available for
|
||
|
| #[+a("/usage/deep-learning") most major machine learning libraries].
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code **kwargs]
|
||
|
+cell -
|
||
|
+cell Parameters for initialising the model
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell object
|
||
|
+cell The initialised model.
|
||
|
|
||
|
+h(2, "init") #{CLASSNAME}.__init__
|
||
|
+tag method
|
||
|
|
||
|
p Create a new pipeline instance.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
from spacy.pipeline import #{CLASSNAME}
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code vocab]
|
||
|
+cell #[code Vocab]
|
||
|
+cell The shared vocabulary.
|
||
|
|
||
|
+row
|
||
|
+cell #[code model]
|
||
|
+cell #[code thinc.neural.Model] or #[code True]
|
||
|
+cell
|
||
|
| The model powering the pipeline component. If no model is
|
||
|
| supplied, the model is created when you call
|
||
|
| #[code begin_training], #[code from_disk] or #[code from_bytes].
|
||
|
|
||
|
+row
|
||
|
+cell #[code **cfg]
|
||
|
+cell -
|
||
|
+cell Configuration parameters.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell #[code=CLASSNAME]
|
||
|
+cell The newly constructed object.
|
||
|
|
||
|
+h(2, "call") #{CLASSNAME}.__call__
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Apply the pipe to one document. The document is modified in place, and
|
||
|
| returned. Both #[code #{CLASSNAME}.__call__] and
|
||
|
| #[code #{CLASSNAME}.pipe] should delegate to the
|
||
|
| #[code #{CLASSNAME}.predict] and #[code #{CLASSNAME}.set_annotations]
|
||
|
| methods.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
doc = nlp(u"This is a sentence.")
|
||
|
processed = #{VARNAME}(doc)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code doc]
|
||
|
+cell #[code Doc]
|
||
|
+cell The document to process.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell #[code Doc]
|
||
|
+cell The processed document.
|
||
|
|
||
|
+h(2, "pipe") #{CLASSNAME}.pipe
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Apply the pipe to a stream of documents. Both
|
||
|
| #[code #{CLASSNAME}.__call__] and #[code #{CLASSNAME}.pipe] should
|
||
|
| delegate to the #[code #{CLASSNAME}.predict] and
|
||
|
| #[code #{CLASSNAME}.set_annotations] methods.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
texts = [u'One doc', u'...', u'Lots of docs']
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
for doc in #{VARNAME}.pipe(texts, batch_size=50):
|
||
|
pass
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code stream]
|
||
|
+cell iterable
|
||
|
+cell A stream of documents.
|
||
|
|
||
|
+row
|
||
|
+cell #[code batch_size]
|
||
|
+cell int
|
||
|
+cell The number of texts to buffer. Defaults to #[code 128].
|
||
|
|
||
|
+row
|
||
|
+cell #[code n_threads]
|
||
|
+cell int
|
||
|
+cell
|
||
|
| The number of worker threads to use. If #[code -1], OpenMP will
|
||
|
| decide how many to use at run time. Default is #[code -1].
|
||
|
|
||
|
+row("foot")
|
||
|
+cell yields
|
||
|
+cell #[code Doc]
|
||
|
+cell Processed documents in the order of the original text.
|
||
|
|
||
|
+h(2, "predict") #{CLASSNAME}.predict
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Apply the pipeline's model to a batch of docs, without modifying them.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
scores = #{VARNAME}.predict([doc1, doc2])
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code docs]
|
||
|
+cell iterable
|
||
|
+cell The documents to predict.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell -
|
||
|
+cell Scores from the model.
|
||
|
|
||
|
+h(2, "set_annotations") #{CLASSNAME}.set_annotations
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Modify a batch of documents, using pre-computed scores.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
scores = #{VARNAME}.predict([doc1, doc2])
|
||
|
#{VARNAME}.set_annotations([doc1, doc2], scores)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code docs]
|
||
|
+cell iterable
|
||
|
+cell The documents to modify.
|
||
|
|
||
|
+row
|
||
|
+cell #[code scores]
|
||
|
+cell -
|
||
|
+cell The scores to set, produced by #[code #{CLASSNAME}.predict].
|
||
|
|
||
|
+h(2, "update") #{CLASSNAME}.update
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Learn from a batch of documents and gold-standard information, updating
|
||
|
| the pipe's model. Delegates to #[code #{CLASSNAME}.predict] and
|
||
|
| #[code #{CLASSNAME}.get_loss].
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
losses = {}
|
||
|
optimizer = nlp.begin_training()
|
||
|
#{VARNAME}.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code docs]
|
||
|
+cell iterable
|
||
|
+cell A batch of documents to learn from.
|
||
|
|
||
|
+row
|
||
|
+cell #[code golds]
|
||
|
+cell iterable
|
||
|
+cell The gold-standard data. Must have the same length as #[code docs].
|
||
|
|
||
|
+row
|
||
|
+cell #[code drop]
|
||
|
+cell float
|
||
|
+cell The dropout rate.
|
||
|
|
||
|
+row
|
||
|
+cell #[code sgd]
|
||
|
+cell callable
|
||
|
+cell
|
||
|
| The optimizer. Should take two arguments #[code weights] and
|
||
|
| #[code gradient], and an optional ID.
|
||
|
|
||
|
+row
|
||
|
+cell #[code losses]
|
||
|
+cell dict
|
||
|
+cell
|
||
|
| Optional record of the loss during training. The value keyed by
|
||
|
| the model's name is updated.
|
||
|
|
||
|
+h(2, "get_loss") #{CLASSNAME}.get_loss
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Find the loss and gradient of loss for the batch of documents and their
|
||
|
| predicted scores.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
scores = #{VARNAME}.predict([doc1, doc2])
|
||
|
loss, d_loss = #{VARNAME}.get_loss([doc1, doc2], [gold1, gold2], scores)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code docs]
|
||
|
+cell iterable
|
||
|
+cell The batch of documents.
|
||
|
|
||
|
+row
|
||
|
+cell #[code golds]
|
||
|
+cell iterable
|
||
|
+cell The gold-standard data. Must have the same length as #[code docs].
|
||
|
|
||
|
+row
|
||
|
+cell #[code scores]
|
||
|
+cell -
|
||
|
+cell Scores representing the model's predictions.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell tuple
|
||
|
+cell The loss and the gradient, i.e. #[code (loss, gradient)].
|
||
|
|
||
|
+h(2, "begin_training") #{CLASSNAME}.begin_training
|
||
|
+tag method
|
||
|
|
||
|
p
|
||
|
| Initialize the pipe for training, using data examples if available. If no
|
||
|
| model has been initialized yet, the model is added.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
nlp.pipeline.append(#{VARNAME})
|
||
|
#{VARNAME}.begin_training(pipeline=nlp.pipeline)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code gold_tuples]
|
||
|
+cell iterable
|
||
|
+cell
|
||
|
| Optional gold-standard annotations from which to construct
|
||
|
| #[+api("goldparse") #[code GoldParse]] objects.
|
||
|
|
||
|
+row
|
||
|
+cell #[code pipeline]
|
||
|
+cell list
|
||
|
+cell
|
||
|
| Optional list of #[+api("pipe") #[code Pipe]] components that
|
||
|
| this component is part of.
|
||
|
|
||
|
+h(2, "use_params") #{CLASSNAME}.use_params
|
||
|
+tag method
|
||
|
+tag contextmanager
|
||
|
|
||
|
p Modify the pipe's model, to use the given parameter values.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
with #{VARNAME}.use_params():
|
||
|
#{VARNAME}.to_disk('/best_model')
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code params]
|
||
|
+cell -
|
||
|
+cell
|
||
|
| The parameter values to use in the model. At the end of the
|
||
|
| context, the original parameters are restored.
|
||
|
|
||
|
+h(2, "to_disk") #{CLASSNAME}.to_disk
|
||
|
+tag method
|
||
|
|
||
|
p Serialize the pipe to disk.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
#{VARNAME}.to_disk('/path/to/#{VARNAME}')
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code path]
|
||
|
+cell unicode or #[code Path]
|
||
|
+cell
|
||
|
| A path to a directory, which will be created if it doesn't exist.
|
||
|
| Paths may be either strings or #[code Path]-like objects.
|
||
|
|
||
|
+h(2, "from_disk") #{CLASSNAME}.from_disk
|
||
|
+tag method
|
||
|
|
||
|
p Load the pipe from disk. Modifies the object in place and returns it.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
#{VARNAME}.from_disk('/path/to/#{VARNAME}')
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code path]
|
||
|
+cell unicode or #[code Path]
|
||
|
+cell
|
||
|
| A path to a directory. Paths may be either strings or
|
||
|
| #[code Path]-like objects.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell #[code=CLASSNAME]
|
||
|
+cell The modified #[code=CLASSNAME] object.
|
||
|
|
||
|
+h(2, "to_bytes") #{CLASSNAME}.to_bytes
|
||
|
+tag method
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
#{VARNAME}_bytes = #{VARNAME}.to_bytes()
|
||
|
|
||
|
p Serialize the pipe to a bytestring.
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code **exclude]
|
||
|
+cell -
|
||
|
+cell Named attributes to prevent from being serialized.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell bytes
|
||
|
+cell The serialized form of the #[code=CLASSNAME] object.
|
||
|
|
||
|
+h(2, "from_bytes") #{CLASSNAME}.from_bytes
|
||
|
+tag method
|
||
|
|
||
|
p Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||
|
|
||
|
+aside-code("Example").
|
||
|
#{VARNAME}_bytes = #{VARNAME}.to_bytes()
|
||
|
#{VARNAME} = #{CLASSNAME}(nlp.vocab)
|
||
|
#{VARNAME}.from_bytes(#{VARNAME}_bytes)
|
||
|
|
||
|
+table(["Name", "Type", "Description"])
|
||
|
+row
|
||
|
+cell #[code bytes_data]
|
||
|
+cell bytes
|
||
|
+cell The data to load from.
|
||
|
|
||
|
+row
|
||
|
+cell #[code **exclude]
|
||
|
+cell -
|
||
|
+cell Named attributes to prevent from being loaded.
|
||
|
|
||
|
+row("foot")
|
||
|
+cell returns
|
||
|
+cell #[code=CLASSNAME]
|
||
|
+cell The #[code=CLASSNAME] object.
|