//- ----------------------------------
//- 💫 DOCS > API > LANGUAGE
//- ----------------------------------
+section("language")
+h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
| #[+tag class] Language
p.
A pipeline that transforms text strings into annotated spaCy Doc objects. Usually you'll load the Language pipeline once and pass the instance around your program.
+code("python", "Overview").
class Language:
Defaults = BaseDefaults
def __init__(self, path=True, **overrides):
self.vocab = Vocab()
self.tokenizer = Tokenizer()
self.tagger = Tagger()
self.parser = DependencyParser()
self.entity = EntityRecognizer()
self.make_doc = lambda text: Doc()
self.pipeline = [self.tagger, self.parser, self.entity]
def __call__(self, text, **toggle):
doc = self.make_doc(text)
for proc in self.pipeline:
if toggle.get(process.name, True):
process(doc)
return doc
def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
docs = (self.make_doc(text) for text in texts_iterator)
for process in self.pipeline:
if toggle.get(process.name, True):
docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
for doc in self.docs:
yield doc
def end_training(self, path=None):
return None
class English(Language):
class Defaults(BaseDefaults):
pass
class German(Language):
class Defaults(BaseDefaults):
pass
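    p.
        For example, a minimal sketch of loading the English pipeline and
        processing a text (assumes the model data has been downloaded):

    +code("python", "Usage (sketch)").
        from spacy.en import English

        nlp = English()                  # load the pipeline once
        doc = nlp(u'Hello, world!')      # process text into a Doc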
+section("english-init")
+h(3, "english-init")
| #[+tag method] Language.__init__
p
| Load the pipeline. You can disable components by passing None as a value,
| e.g. pass parser=None, vectors=None to save memory if you're not using
| those components. You can also pass an object as the value.
| Pass a function create_pipeline to use a custom pipeline --- see
| the custom pipeline tutorial.
+aside("Efficiency").
Loading takes 10-20 seconds, and the instance consumes 2 to 3
gigabytes of memory. Intended use is for one instance to be
created for each language per process, but you can create more
if you're doing something unusual. You may wish to make the
instance a global variable or "singleton".
+table(["Example", "Description"])
+row
+cell #[code nlp = English()]
+cell Load everything, from default path.
+row
+cell #[code nlp = English(path='my_data')]
+cell Load everything, from specified path
+row
+cell #[code nlp = English(path=path_obj)]
+cell Load everything, from an object that follows the #[code pathlib.Path] protocol.
+row
+cell #[code nlp = English(parser=False, vectors=False)]
+cell Load everything except the parser and the word vectors.
+row
+cell #[code nlp = English(parser=my_parser)]
+cell Load everything, and use a custom parser.
+row
+cell #[code nlp = English(create_pipeline=my_pipeline)]
+cell Load everything, and use a custom pipeline.
+code("python", "Definition").
def __init__(self, path=True, **overrides):
D = self.Defaults
self.vocab = Vocab(path=path, parent=self, **D.vocab) \
if 'vocab' not in overrides \
else overrides['vocab']
self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
if 'tokenizer' not in overrides \
else overrides['tokenizer']
self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
if 'tagger' not in overrides \
else overrides['tagger']
self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
if 'parser' not in overrides \
else overrides['parser']
self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
if 'entity' not in overrides \
else overrides['entity']
self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
if 'matcher' not in overrides \
else overrides['matcher']
if 'make_doc' in overrides:
self.make_doc = overrides['make_doc']
elif 'create_make_doc' in overrides:
self.make_doc = overrides['create_make_doc'](self)
else:
self.make_doc = lambda text: self.tokenizer(text)
if 'pipeline' in overrides:
self.pipeline = overrides['pipeline']
elif 'create_pipeline' in overrides:
self.pipeline = overrides['create_pipeline'](self)
else:
self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
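    p.
        For illustration, here's a minimal sketch of a custom
        #[code create_pipeline] function. As the definition above shows, the
        function receives the #[code Language] instance and returns the list
        of components to apply; the name #[code my_pipeline] is just an
        example.

    +code("python", "Usage (sketch)").
        # Illustrative: build a custom pipeline that skips the matcher.
        def my_pipeline(nlp):
            return [nlp.tagger, nlp.parser, nlp.entity]

        nlp = English(create_pipeline=my_pipeline)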
+section("language-call")
+h(3, "language-call")
| #[+tag method] Language.__call__
p
| The main entry point to spaCy. Takes raw unicode text, and returns
| a #[code Doc] object, which can be iterated to access #[code Token]
| and #[code Span] objects.
+aside("Efficiency").
spaCy's algorithms are all linear-time, so you can supply
documents of arbitrary length, e.g. whole novels.
+table(["Example", "Description"], "code")
+row
+cell #[ doc = nlp(u'Some text.')]
+cell Apply the full pipeline.
+row
+cell #[ doc = nlp(u'Some text.', parse=False)]
+cell Applies tagger and entity, not parser
+row
+cell #[ doc = nlp(u'Some text.', entity=False)]
+cell Applies tagger and parser, not entity.
+row
+cell #[ doc = nlp(u'Some text.', tag=False)]
+cell Does not apply tagger, entity or parser
+row
+cell #[ doc = nlp(u'')]
+cell Zero-length tokens, not an error
+row
+cell #[ doc = nlp(b'Some text')]
+cell Error: need unicode
+row
+cell #[ doc = nlp(b'Some text'.decode('utf8'))]
+cell Decode bytes into unicode first.
+code("python", "Definition").
def __call__(self, text, tag=True, parse=True, entity=True, matcher=True):
return self
+table(["Name", "Type", "Description"])
+row
+cell text
+cell #[+a(link_unicode) unicode]
+cell.
The text to be processed. spaCy expects raw unicode text
you don"t necessarily need to, say, split it into paragraphs.
However, depending on your documents, you might be better
off applying custom pre-processing. Non-text formatting,
e.g. from HTML mark-up, should be removed before sending
the document to spaCy. If your documents have a consistent
format, you may be able to improve accuracy by pre-processing.
For instance, if the first word of your documents are always
in upper-case, it may be helpful to normalize them before
supplying them to spaCy.
+row
+cell tag
+cell #[+a(link_bool) bool]
+cell.
Whether to apply the part-of-speech tagger. Required for
parsing and entity recognition.
+row
+cell parse
+cell #[+a(link_bool) bool]
+cell.
Whether to apply the syntactic dependency parser.
+row
+cell entity
+cell #[+a(link_bool) bool]
+cell.
Whether to apply the named entity recognizer.
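    p.
        A minimal usage sketch, assuming a loaded #[code nlp] pipeline. The
        attributes shown (#[code pos_], #[code ents], #[code sents]) come
        from the #[code Doc], #[code Token] and #[code Span] APIs.

    +code("python", "Usage (sketch)").
        doc = nlp(u'London is a big city in the United Kingdom.')
        for token in doc:
            print(token.text, token.pos_)    # per-token annotations
        for ent in doc.ents:
            print(ent.label_, ent.text)      # named entity spans
        for sent in doc.sents:
            print(sent.text)                 # sentence spans (needs the parser)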
+section("english-pipe")
+h(3, "english-pipe")
| #[+tag method] English.pipe
p
| Parse a sequence of texts into a sequence of #[code Doc] objects.
| Accepts a generator as input, and produces a generator as output.
| Internally, it accumulates a buffer of #[code batch_size]
| texts, works on them with #[code n_threads] workers in parallel,
| and then yields the #[code Doc] objects one by one.
+aside("Efficiency").
spaCy releases the global interpreter lock around the parser and
named entity recognizer, allowing shared-memory parallelism via
OpenMP. However, OpenMP is not supported on OSX — so multiple
threads will only be used on Linux and Windows.
+table(["Example", "Description"], "usage")
+row
+cell #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/parallel_parse.py") parallel_parse.py]
+cell Parse comments from Reddit in parallel.
+code("python", "Definition").
def pipe(self, texts, n_threads=2, batch_size=1000):
yield Doc()
+table(["Arg", "Type", "Description"])
+row
+cell texts
+cell
+cell.
A sequence of unicode objects. Usually you will want this
to be a generator, so that you don"t need to have all of
your texts in memory.
+row
+cell n_threads
+cell #[+a(link_int) int]
+cell.
The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
+row
+cell batch_size
+cell #[+a(link_int) int]
+cell.
The number of texts to buffer. Let"s say you have a
#[code batch_size] of 1,000. The input, #[code texts], is
a generator that yields the texts one-by-one. We want to
operate on them in parallel. So, we accumulate a work queue.
Instead of taking one document from #[code texts] and
operating on it, we buffer #[code batch_size] documents,
work on them in parallel, and then yield them one-by-one.
Higher #[code batch_size] therefore often results in better
parallelism, up to a point.
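    p.
        A minimal streaming sketch. The file name #[code 'texts.txt'] and
        the choice of four threads are illustrative; any iterable of unicode
        strings works as input.

    +code("python", "Usage (sketch)").
        import io

        # Stream texts through the pipeline without holding them all in memory.
        with io.open('texts.txt', encoding='utf8') as file_:
            texts = (line.strip() for line in file_)
            for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
                print(len(doc))    # each item is a fully annotated Doc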