//- ----------------------------------
//- 💫 DOCS > API > LANGUAGE
//- ----------------------------------

+section("language")
    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
        | #[+tag class] Language

    p.
        A pipeline that transforms text strings into annotated spaCy #[code Doc]
        objects. Usually you'll load the #[code Language] pipeline once and pass
        the instance around your program.

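    p.
        As a rough sketch of that usage pattern (assuming the English model data
        is installed), you load the pipeline once and reuse the instance for
        every text:

    +code("python", "Usage sketch").
        from spacy.en import English

        nlp = English()                     # load once; this is the slow step
        for text in (u'First document.', u'Second document.'):
            doc = nlp(text)                 # reuse the same instance for each text
            print([(word.text, word.pos_) for word in doc])
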
    +code("python", "Overview").
 | 
						||
        class Language:
 | 
						||
            Defaults = BaseDefaults
 | 
						||
 | 
						||
            def __init__(self, path=True, **overrides):
 | 
						||
                self.vocab = Vocab()
 | 
						||
                self.tokenizer = Tokenizer()
 | 
						||
                self.tagger = Tagger()
 | 
						||
                self.parser = DependencyParser()
 | 
						||
                self.entity = EntityRecognizer()
 | 
						||
                self.make_doc = lambda text: Doc()
 | 
						||
                self.pipeline = [self.tagger, self.parser, self.entity]
 | 
						||
 | 
						||
            def __call__(self, text, **toggle):
 | 
						||
                doc = self.make_doc(text)
 | 
						||
                for proc in self.pipeline:
 | 
						||
                    if toggle.get(process.name, True):
 | 
						||
                        process(doc)
 | 
						||
                return doc
 | 
						||
 | 
						||
            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
 | 
						||
                docs = (self.make_doc(text) for text in texts_iterator)
 | 
						||
                for process in self.pipeline:
 | 
						||
                    if toggle.get(process.name, True):
 | 
						||
                        docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
 | 
						||
                for doc in self.docs:
 | 
						||
                    yield doc
 | 
						||
 | 
						||
            def end_training(self, path=None):
 | 
						||
                return None
 | 
						||
 | 
						||
            class English(Language):
 | 
						||
                class Defaults(BaseDefaults):
 | 
						||
                    pass
 | 
						||
 | 
						||
            class German(Language):
 | 
						||
                class Defaults(BaseDefaults):
 | 
						||
                    pass
 | 
						||
 | 
						||
    +section("english-init")
 | 
						||
        +h(3, "english-init")
 | 
						||
            | #[+tag method] Language.__init__
 | 
						||
 | 
						||
        p
 | 
						||
            | Load the pipeline.  You can disable components by passing None as a value,
 | 
						||
            | e.g. pass parser=None, vectors=None to save memory if you're not using
 | 
						||
            | those components. You can also pass an object as the value.
 | 
						||
            | Pass a function create_pipeline to use a custom pipeline --- see
 | 
						||
            | the custom pipeline tutorial.
 | 
						||
 | 
						||
            +aside("Efficiency").
 | 
						||
                Loading takes 10-20 seconds, and the instance consumes 2 to 3
 | 
						||
                gigabytes of memory.  Intended use is for one instance to be
 | 
						||
                created for each language per process, but you can create more
 | 
						||
                if you're doing something unusual. You may wish to make the
 | 
						||
                instance a global variable or "singleton".
 | 
						||
 | 
						||
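        p.
            One common way to follow that advice (a sketch, not part of the spaCy
            API) is a lazily initialised module-level singleton:

        +code("python", "Singleton sketch").
            _NLP = None

            def get_nlp():
                global _NLP
                if _NLP is None:
                    _NLP = English()    # created at most once per process
                return _NLP
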
        +table(["Example", "Description"])
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English()]
 | 
						||
                +cell Load everything, from default path.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(path='my_data')]
 | 
						||
                +cell Load everything, from specified path
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(path=path_obj)]
 | 
						||
                +cell Load everything, from an object that follows the #[code pathlib.Path] protocol.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(parser=False, vectors=False)]
 | 
						||
                +cell Load everything except the parser and the word vectors.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(parser=my_parser)]
 | 
						||
                +cell Load everything, and use a custom parser.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(create_pipeline=my_pipeline)]
 | 
						||
                +cell Load everything, and use a custom pipeline.
 | 
						||
 | 
						||
        +code("python", "Definition").
 | 
						||
            def __init__(self, path=True, **overrides):
 | 
						||
                D = self.Defaults
 | 
						||
                self.vocab     = Vocab(path=path, parent=self, **D.vocab) \
 | 
						||
                                 if 'vocab' not in overrides \
 | 
						||
                                 else overrides['vocab']
 | 
						||
                self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
 | 
						||
                                 if 'tokenizer' not in overrides \
 | 
						||
                                 else overrides['tokenizer']
 | 
						||
                self.tagger    = Tagger(self.vocab, path=path, **D.tagger) \
 | 
						||
                                 if 'tagger' not in overrides \
 | 
						||
                                 else overrides['tagger']
 | 
						||
                self.parser    = DependencyParser(self.vocab, path=path, **D.parser) \
 | 
						||
                                 if 'parser' not in overrides \
 | 
						||
                                 else overrides['parser']
 | 
						||
                self.entity    = EntityRecognizer(self.vocab, path=path, **D.entity) \
 | 
						||
                                 if 'entity' not in overrides \
 | 
						||
                                 else overrides['entity']
 | 
						||
                self.matcher   = Matcher(self.vocab, path=path, **D.matcher) \
 | 
						||
                                 if 'matcher' not in overrides \
 | 
						||
                                 else overrides['matcher']
 | 
						||
 | 
						||
                if 'make_doc' in overrides:
 | 
						||
                    self.make_doc = overrides['make_doc']
 | 
						||
                elif 'create_make_doc' in overrides:
 | 
						||
                    self.make_doc = overrides['create_make_doc'](self)
 | 
						||
                else:
 | 
						||
                    self.make_doc = lambda text: self.tokenizer(text)
 | 
						||
                if 'pipeline' in overrides:
 | 
						||
                    self.pipeline = overrides['pipeline']
 | 
						||
                elif 'create_pipeline' in overrides:
 | 
						||
                    self.pipeline = overrides['create_pipeline'](self)
 | 
						||
                else:
 | 
						||
                    self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
 | 
						||
 | 
						||
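        p.
            As a minimal sketch of the #[code create_pipeline] hook shown above
            (the #[code my_pipeline] name is only an example), any callable that
            takes the #[code nlp] object and returns a list of processes can be
            passed:

        +code("python", "Custom pipeline sketch").
            def my_pipeline(nlp):
                # run only the tagger and entity recognizer, skipping the parser
                return [nlp.tagger, nlp.entity]

            nlp = English(create_pipeline=my_pipeline)
            doc = nlp(u'A text processed without the parser.')
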
        +section("language-call")
 | 
						||
            +h(3, "language-call")
 | 
						||
                | #[+tag method] Language.__call__
 | 
						||
 | 
						||
            p
 | 
						||
                | The main entry point to spaCy. Takes raw unicode text, and returns
 | 
						||
                | a #[code Doc] object, which can be iterated to access #[code Token]
 | 
						||
                | and #[code Span] objects.
 | 
						||
 | 
						||
                +aside("Efficiency").
 | 
						||
                    spaCy's algorithms are all linear-time, so you can supply
 | 
						||
                    documents of arbitrary length, e.g. whole novels.
 | 
						||
 | 
						||
            +table(["Example", "Description"], "code")
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.')]
 | 
						||
                    +cell Apply the full pipeline.
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.', parse=False)]
 | 
						||
                    +cell Applies tagger and entity, not parser
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.', entity=False)]
 | 
						||
                    +cell Applies tagger and parser, not entity.
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.', tag=False)]
 | 
						||
                    +cell Does not apply tagger, entity or parser
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'')]
 | 
						||
                    +cell Zero-length tokens, not an error
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(b'Some text')]
 | 
						||
                    +cell Error: need unicode
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(b'Some text'.decode('utf8'))]
 | 
						||
                    +cell Decode bytes into unicode first.
 | 
						||
 | 
						||
            +code("python", "Definition").
 | 
						||
                def __call__(self, text, tag=True, parse=True, entity=True, matcher=True):
 | 
						||
                    return self
 | 
						||
 | 
						||
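            p.
                For instance (a sketch, assuming an #[code English] instance named
                #[code nlp]), iterating the returned #[code Doc] yields #[code Token]
                objects, while #[code doc.sents] yields sentence #[code Span] objects:

            +code("python", "Usage sketch").
                doc = nlp(u'London is a big city in the United Kingdom.')
                token = doc[0]                    # a Token
                print(token.text, token.tag_)
                for sent in doc.sents:            # each sentence is a Span
                    print(sent.text)
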
            +table(["Name", "Type", "Description"])
 | 
						||
                +row
 | 
						||
                    +cell text
 | 
						||
                    +cell #[+a(link_unicode) unicode]
 | 
						||
                    +cell.
 | 
						||
                        The text to be processed. spaCy expects raw unicode text
 | 
						||
                        – you don"t necessarily need to, say, split it into paragraphs.
 | 
						||
                        However, depending on your documents, you might be better
 | 
						||
                        off applying custom pre-processing. Non-text formatting,
 | 
						||
                        e.g. from HTML mark-up, should be removed before sending
 | 
						||
                        the document to spaCy. If your documents have a consistent
 | 
						||
                        format, you may be able to improve accuracy by pre-processing.
 | 
						||
                        For instance, if the first word of your documents are always
 | 
						||
                        in upper-case, it may be helpful to normalize them before
 | 
						||
                        supplying them to spaCy.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell tag
 | 
						||
                    +cell #[+a(link_bool) bool]
 | 
						||
                    +cell.
 | 
						||
                        Whether to apply the part-of-speech tagger. Required for
 | 
						||
                        parsing and entity recognition.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell parse
 | 
						||
                    +cell #[+a(link_bool) bool]
 | 
						||
                    +cell.
 | 
						||
                        Whether to apply the syntactic dependency parser.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell entity
 | 
						||
                    +cell #[+a(link_bool) bool]
 | 
						||
                    +cell.
 | 
						||
                        Whether to apply the named entity recognizer.
 | 
						||
 | 
						||
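            p.
                As a rough illustration of that kind of pre-processing (the
                #[code strip_markup] helper here is hypothetical, not part of spaCy):

            +code("python", "Pre-processing sketch").
                import re

                def strip_markup(text):
                    # naive clean-up: replace anything that looks like an HTML tag
                    return re.sub(r'<[^>]+>', u' ', text)

                doc = nlp(strip_markup(u'<p>Some <b>marked-up</b> text.</p>'))
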
        +section("english-pipe")
 | 
						||
            +h(3, "english-pipe")
 | 
						||
                | #[+tag method] English.pipe
 | 
						||
 | 
						||
            p
 | 
						||
                | Parse a sequence of texts into a sequence of #[code Doc] objects.
 | 
						||
                | Accepts a generator as input, and produces a generator as output.
 | 
						||
                | Internally, it accumulates a buffer of #[code batch_size]
 | 
						||
                | texts, works on them with #[code n_threads] workers in parallel,
 | 
						||
                | and then yields the #[code Doc] objects one by one.
 | 
						||
 | 
						||
                +aside("Efficiency").
 | 
						||
                    spaCy releases the global interpreter lock around the parser and
 | 
						||
                    named entity recognizer, allowing shared-memory parallelism via
 | 
						||
                    OpenMP. However, OpenMP is not supported on OSX — so multiple
 | 
						||
                    threads will only be used on Linux and Windows.
 | 
						||
 | 
						||
            +table(["Example", "Description"], "usage")
 | 
						||
                +row
 | 
						||
                    +cell #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/parallel_parse.py") parallel_parse.py]
 | 
						||
                    +cell Parse comments from Reddit in parallel.
 | 
						||
 | 
						||
            +code("python", "Definition").
 | 
						||
                def pipe(self, texts, n_threads=2, batch_size=1000):
 | 
						||
                    yield Doc()
 | 
						||
 | 
						||
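            p.
                A rough usage sketch (the #[code read_texts] helper and the file
                name are assumptions for illustration, not part of spaCy):

            +code("python", "Usage sketch").
                import io

                def read_texts(path):
                    # stream unicode texts from disk, one per line
                    with io.open(path, encoding='utf8') as file_:
                        for line in file_:
                            yield line.strip()

                for doc in nlp.pipe(read_texts('comments.txt'), batch_size=1000, n_threads=4):
                    print(len(doc))
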
            +table(["Arg", "Type", "Description"])
 | 
						||
                +row
 | 
						||
                    +cell texts
 | 
						||
                    +cell
 | 
						||
                    +cell.
 | 
						||
                        A sequence of unicode objects. Usually you will want this
 | 
						||
                        to be a generator, so that you don"t need to have all of
 | 
						||
                        your texts in memory.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell n_threads
 | 
						||
                    +cell #[+a(link_int) int]
 | 
						||
                    +cell.
 | 
						||
                        The number of worker threads to use. If -1, OpenMP will
 | 
						||
                        decide how many to use at run time. Default is 2.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell batch_size
 | 
						||
                    +cell #[+a(link_int) int]
 | 
						||
                    +cell.
 | 
						||
                        The number of texts to buffer. Let"s say you have a
 | 
						||
                        #[code batch_size] of 1,000. The input, #[code texts], is
 | 
						||
                        a generator that yields the texts one-by-one. We want to
 | 
						||
                        operate on them in parallel. So, we accumulate a work queue.
 | 
						||
                        Instead of taking one document from #[code texts] and
 | 
						||
                        operating on it, we buffer #[code batch_size] documents,
 | 
						||
                        work on them in parallel, and then yield them one-by-one.
 | 
						||
                        Higher #[code batch_size] therefore often results in better
 | 
						||
                        parallelism, up to a point.
 |