spaCy/website/docs/_api-language.jade

//- ----------------------------------
//- 💫 DOCS > API > LANGUAGE
//- ----------------------------------
+section("language")
+h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
| #[+tag class] Language
p.
A pipeline that transforms text strings into annotated spaCy Doc objects. Usually you'll load the Language pipeline once and pass the instance around your program.
+code("python", "Overview").
class Language:
    Defaults = BaseDefaults

    def __init__(self, path=True, **overrides):
        self.vocab = Vocab()
        self.tokenizer = Tokenizer()
        self.tagger = Tagger()
        self.parser = DependencyParser()
        self.entity = EntityRecognizer()
        self.make_doc = lambda text: Doc()
        self.pipeline = [self.tagger, self.parser, self.entity]

    def __call__(self, text, **toggle):
        doc = self.make_doc(text)
        for process in self.pipeline:
            if toggle.get(process.name, True):
                process(doc)
        return doc

    def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
        docs = (self.make_doc(text) for text in texts_iterator)
        for process in self.pipeline:
            if toggle.get(process.name, True):
                docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
        for doc in docs:
            yield doc

    def end_training(self, path=None):
        return None

class English(Language):
    class Defaults(BaseDefaults):
        pass

class German(Language):
    class Defaults(BaseDefaults):
        pass
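p.
    For orientation, a minimal usage sketch (assuming the English model data is
    installed): load the pipeline once, then re-use the same instance for every
    document you process.

+code("python", "Minimal usage (illustrative sketch)").
    from spacy.en import English

    nlp = English()               # slow: loads the models, so do this once per process
    doc = nlp(u'Some text to annotate.')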
+section("english-init")
+h(3, "english-init")
| #[+tag method] Language.__init__
p
| Load the pipeline. You can disable components by passing #[code None] as
| a value, e.g. pass #[code parser=None, vectors=None] to save memory if
| you're not using those components. To supply a custom component, pass an
| object as the value. Pass a function as #[code create_pipeline] to use a
| custom pipeline --- see the custom pipeline tutorial.
+aside("Efficiency").
Loading takes 10-20 seconds, and the instance consumes 2 to 3
gigabytes of memory. Intended use is for one instance to be
created for each language per process, but you can create more
if you're doing something unusual. You may wish to make the
instance a global variable or "singleton".
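p.
    For example, a lazily-loaded module-level instance keeps loading to once per
    process. This helper is only an illustration, not part of spaCy's API.

+code("python", "One instance per process (illustrative sketch)").
    _NLP = None

    def get_nlp():
        # Load the pipeline on first use, then re-use the cached instance.
        global _NLP
        if _NLP is None:
            _NLP = English()
        return _NLP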
+table(["Example", "Description"])
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[code nlp = English()]
+cell Load everything, from default path.
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[code nlp = English(path='my_data')]
+cell Load everything, from specified path
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[code nlp = English(path=path_obj)]
+cell Load everything, from an object that follows the #[code pathlib.Path] protocol.
2016-10-03 21:19:13 +03:00
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[code nlp = English(parser=False, vectors=False)]
+cell Load everything except the parser and the word vectors.
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[code nlp = English(parser=my_parser)]
+cell Load everything, and use a custom parser.
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[code nlp = English(create_pipeline=my_pipeline)]
+cell Load everything, and use a custom pipeline.
+code("python", "Definition").
def __init__(self, path=True, **overrides):
    D = self.Defaults
    self.vocab = Vocab(path=path, parent=self, **D.vocab) \
        if 'vocab' not in overrides \
        else overrides['vocab']
    self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
        if 'tokenizer' not in overrides \
        else overrides['tokenizer']
    self.tagger = Tagger(self.vocab, path=path, **D.tagger) \
        if 'tagger' not in overrides \
        else overrides['tagger']
    self.parser = DependencyParser(self.vocab, path=path, **D.parser) \
        if 'parser' not in overrides \
        else overrides['parser']
    self.entity = EntityRecognizer(self.vocab, path=path, **D.entity) \
        if 'entity' not in overrides \
        else overrides['entity']
    self.matcher = Matcher(self.vocab, path=path, **D.matcher) \
        if 'matcher' not in overrides \
        else overrides['matcher']

    if 'make_doc' in overrides:
        self.make_doc = overrides['make_doc']
    elif 'create_make_doc' in overrides:
        self.make_doc = overrides['create_make_doc'](self)
    else:
        self.make_doc = lambda text: self.tokenizer(text)

    if 'pipeline' in overrides:
        self.pipeline = overrides['pipeline']
    elif 'create_pipeline' in overrides:
        self.pipeline = overrides['create_pipeline'](self)
    else:
        self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
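p.
    For illustration, a sketch of the #[code create_pipeline] hook: the function
    receives the #[code Language] instance and returns the list of processes to
    apply, so you can re-order or drop components. The name #[code my_pipeline]
    is hypothetical.

+code("python", "create_pipeline (illustrative sketch)").
    def my_pipeline(nlp):
        # Run only the tagger and entity recognizer, skipping the parser.
        return [nlp.tagger, nlp.entity]

    nlp = English(create_pipeline=my_pipeline)
    doc = nlp(u'Some text.')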
+section("language-call")
+h(3, "language-call")
| #[+tag method] Language.__call__
p
| The main entry point to spaCy. Takes raw unicode text, and returns
| a #[code Doc] object, which can be iterated to access #[code Token]
| and #[code Span] objects.
+aside("Efficiency").
spaCy's algorithms are all linear-time, so you can supply
documents of arbitrary length, e.g. whole novels.
+table(["Example", "Description"], "code")
2016-03-31 17:24:48 +03:00
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(u'Some text.')]
2016-03-31 17:24:48 +03:00
+cell Apply the full pipeline.
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(u'Some text.', parse=False)]
2016-03-31 17:24:48 +03:00
+cell Applies tagger and entity, not parser
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(u'Some text.', entity=False)]
2016-03-31 17:24:48 +03:00
+cell Applies tagger and parser, not entity.
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(u'Some text.', tag=False)]
2016-03-31 17:24:48 +03:00
+cell Does not apply tagger, entity or parser
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(u'')]
2016-03-31 17:24:48 +03:00
+cell Zero-length tokens, not an error
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(b'Some text')]
2016-03-31 17:24:48 +03:00
+cell Error: need unicode
+row
2016-10-21 01:58:24 +03:00
+cell #[ doc = nlp(b'Some text'.decode('utf8'))]
2016-03-31 17:24:48 +03:00
+cell Decode bytes into unicode first.
+code("python", "Definition").
def __call__(self, text, tag=True, parse=True, entity=True, matcher=True):
    return Doc()
+table(["Name", "Type", "Description"])
2016-03-31 17:24:48 +03:00
+row
+cell text
2016-10-03 21:19:13 +03:00
+cell #[+a(link_unicode) unicode]
2016-03-31 17:24:48 +03:00
+cell.
2016-10-03 21:19:13 +03:00
The text to be processed. spaCy expects raw unicode text
you don"t necessarily need to, say, split it into paragraphs.
However, depending on your documents, you might be better
off applying custom pre-processing. Non-text formatting,
e.g. from HTML mark-up, should be removed before sending
the document to spaCy. If your documents have a consistent
format, you may be able to improve accuracy by pre-processing.
For instance, if the first word of your documents are always
in upper-case, it may be helpful to normalize them before
2016-03-31 17:24:48 +03:00
supplying them to spaCy.
+row
+cell tag
2016-10-03 21:19:13 +03:00
+cell #[+a(link_bool) bool]
2016-03-31 17:24:48 +03:00
+cell.
2016-10-03 21:19:13 +03:00
Whether to apply the part-of-speech tagger. Required for
2016-03-31 17:24:48 +03:00
parsing and entity recognition.
+row
+cell parse
2016-10-03 21:19:13 +03:00
+cell #[+a(link_bool) bool]
2016-03-31 17:24:48 +03:00
+cell.
Whether to apply the syntactic dependency parser.
+row
+cell entity
2016-10-03 21:19:13 +03:00
+cell #[+a(link_bool) bool]
2016-03-31 17:24:48 +03:00
+cell.
Whether to apply the named entity recognizer.
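p.
    A minimal sketch of calling the pipeline, assuming #[code nlp] is a loaded
    #[code English] instance: the returned #[code Doc] can be iterated for
    #[code Token] objects and sliced into #[code Span] objects.

+code("python", "Usage (illustrative sketch)").
    doc = nlp(u'London is a big city in the United Kingdom.')
    tags = [(token.orth_, token.tag_) for token in doc]   # each item is a Token
    span = doc[0:3]                                       # slicing a Doc yields a Span

    # Skip the parser if you only need tags and entities.
    doc = nlp(u'Some text.', parse=False)

    # Bytes must be decoded into unicode before calling the pipeline.
    doc = nlp(b'Some text'.decode('utf8'))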
+section("english-pipe")
+h(3, "english-pipe")
| #[+tag method] English.pipe
p
| Parse a sequence of texts into a sequence of #[code Doc] objects.
| Accepts a generator as input, and produces a generator as output.
| Internally, it accumulates a buffer of #[code batch_size]
| texts, works on them with #[code n_threads] workers in parallel,
| and then yields the #[code Doc] objects one by one.
+aside("Efficiency").
spaCy releases the global interpreter lock around the parser and
named entity recognizer, allowing shared-memory parallelism via
OpenMP. However, OpenMP is not supported on OSX — so multiple
threads will only be used on Linux and Windows.
+table(["Example", "Description"], "usage")
+row
+cell #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/parallel_parse.py") parallel_parse.py]
+cell Parse comments from Reddit in parallel.
+code("python", "Definition").
def pipe(self, texts, n_threads=2, batch_size=1000):
    yield Doc()
+table(["Arg", "Type", "Description"])
2016-03-31 17:24:48 +03:00
+row
+cell texts
+cell
+cell.
2016-10-03 21:19:13 +03:00
A sequence of unicode objects. Usually you will want this
to be a generator, so that you don"t need to have all of
2016-03-31 17:24:48 +03:00
your texts in memory.
+row
+cell n_threads
2016-10-03 21:19:13 +03:00
+cell #[+a(link_int) int]
2016-03-31 17:24:48 +03:00
+cell.
2016-10-03 21:19:13 +03:00
The number of worker threads to use. If -1, OpenMP will
2016-03-31 17:24:48 +03:00
decide how many to use at run time. Default is 2.
+row
+cell batch_size
2016-10-03 21:19:13 +03:00
+cell #[+a(link_int) int]
2016-03-31 17:24:48 +03:00
+cell.
2016-10-03 21:19:13 +03:00
The number of texts to buffer. Let"s say you have a
#[code batch_size] of 1,000. The input, #[code texts], is
a generator that yields the texts one-by-one. We want to
operate on them in parallel. So, we accumulate a work queue.
Instead of taking one document from #[code texts] and
operating on it, we buffer #[code batch_size] documents,
work on them in parallel, and then yield them one-by-one.
2016-03-31 17:24:48 +03:00
Higher #[code batch_size] therefore often results in better
parallelism, up to a point.
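p.
    A minimal sketch of streaming texts through #[code .pipe()], assuming
    #[code nlp] is a loaded #[code English] instance. The #[code read_texts]
    generator is hypothetical; the point is that texts are yielded one at a
    time, so the whole corpus never has to sit in memory.

+code("python", "Usage (illustrative sketch)").
    import io

    def read_texts(path):
        # Yield one unicode string per line of the file.
        with io.open(path, encoding='utf8') as file_:
            for line in file_:
                yield line.strip()

    for doc in nlp.pipe(read_texts('my_corpus.txt'), batch_size=1000, n_threads=4):
        print(len(doc))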