//- ----------------------------------
//- 💫 DOCS > API > LANGUAGE
//- ----------------------------------

+section("language")
    +h(2, "language", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/language.py")
        | #[+tag class] Language

    p.
        A pipeline that transforms text strings into annotated spaCy #[code Doc]
        objects. Usually you'll load the #[code Language] pipeline once and pass
        the instance around your program.

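    p.
        As a rough sketch of that usage pattern (assuming the English model data
        is installed), you load the pipeline once and reuse the instance for
        every text:

    +code("python", "Usage sketch").
        from spacy.en import English

        nlp = English()                     # load once; this is the slow step
        for text in (u'First document.', u'Second document.'):
            doc = nlp(text)                 # reuse the same instance for each text
            print([(word.text, word.pos_) for word in doc])
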
    +code("python", "Overview").
 | 
						||
        class Language:
 | 
						||
            Defaults = BaseDefaults
 | 
						||
 | 
						||
            def __init__(self, path=True, **overrides):
 | 
						||
                self.vocab = Vocab()
 | 
						||
                self.tokenizer = Tokenizer()
 | 
						||
                self.tagger = Tagger()
 | 
						||
                self.parser = DependencyParser()
 | 
						||
                self.entity = EntityRecognizer()
 | 
						||
                self.make_doc = lambda text: Doc()
 | 
						||
                self.pipeline = [self.tagger, self.parser, self.entity]
 | 
						||
 | 
						||
            def __call__(self, text, **toggle):
 | 
						||
                doc = self.make_doc(text)
 | 
						||
                for proc in self.pipeline:
 | 
						||
                    if toggle.get(process.name, True):
 | 
						||
                        process(doc)
 | 
						||
                return doc
 | 
						||
 | 
						||
            def pipe(self, texts_iterator, batch_size=1000, n_threads=2, **toggle):
 | 
						||
                docs = (self.make_doc(text) for text in texts_iterator)
 | 
						||
                for process in self.pipeline:
 | 
						||
                    if toggle.get(process.name, True):
 | 
						||
                        docs = process.pipe(docs, batch_size=batch_size, n_threads=n_threads)
 | 
						||
                for doc in self.docs:
 | 
						||
                    yield doc
 | 
						||
 | 
						||
            def end_training(self, path=None):
 | 
						||
                return None
 | 
						||
 | 
						||
            class English(Language):
 | 
						||
                class Defaults(BaseDefaults):
 | 
						||
                    pass
 | 
						||
 | 
						||
            class German(Language):
 | 
						||
                class Defaults(BaseDefaults):
 | 
						||
                    pass
 | 
						||
 | 
						||
    +section("english-init")
 | 
						||
        +h(3, "english-init")
 | 
						||
            | #[+tag method] Language.__init__
 | 
						||
 | 
						||
        p
 | 
						||
            | Load the pipeline.  You can disable components by passing None as a value,
 | 
						||
            | e.g. pass parser=None, vectors=None to save memory if you're not using
 | 
						||
            | those components. You can also pass an object as the value.
 | 
						||
            | Pass a function create_pipeline to use a custom pipeline --- see
 | 
						||
            | the custom pipeline tutorial.
 | 
						||
 | 
						||
            +aside("Efficiency").
 | 
						||
                Loading takes 10-20 seconds, and the instance consumes 2 to 3
 | 
						||
                gigabytes of memory.  Intended use is for one instance to be
 | 
						||
                created for each language per process, but you can create more
 | 
						||
                if you're doing something unusual. You may wish to make the
 | 
						||
                instance a global variable or "singleton".
 | 
						||
 | 
						||
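        p.
            One common way to follow that advice (a sketch, not part of the spaCy
            API) is a lazily initialised module-level singleton:

        +code("python", "Singleton sketch").
            _NLP = None

            def get_nlp():
                global _NLP
                if _NLP is None:
                    _NLP = English()    # created at most once per process
                return _NLP
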
        +table(["Example", "Description"])
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English()]
 | 
						||
                +cell Load everything, from default path.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(path='my_data')]
 | 
						||
                +cell Load everything, from specified path
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(path=path_obj)]
 | 
						||
                +cell Load everything, from an object that follows the #[code pathlib.Path] protocol.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(parser=False, vectors=False)]
 | 
						||
                +cell Load everything except the parser and the word vectors.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(parser=my_parser)]
 | 
						||
                +cell Load everything, and use a custom parser.
 | 
						||
 | 
						||
            +row
 | 
						||
                +cell #[code nlp = English(create_pipeline=my_pipeline)]
 | 
						||
                +cell Load everything, and use a custom pipeline.
 | 
						||
 | 
						||
        +code("python", "Definition").
 | 
						||
            def __init__(self, path=True, **overrides):
 | 
						||
                D = self.Defaults
 | 
						||
                self.vocab     = Vocab(path=path, parent=self, **D.vocab) \
 | 
						||
                                 if 'vocab' not in overrides \
 | 
						||
                                 else overrides['vocab']
 | 
						||
                self.tokenizer = Tokenizer(self.vocab, path=path, **D.tokenizer) \
 | 
						||
                                 if 'tokenizer' not in overrides \
 | 
						||
                                 else overrides['tokenizer']
 | 
						||
                self.tagger    = Tagger(self.vocab, path=path, **D.tagger) \
 | 
						||
                                 if 'tagger' not in overrides \
 | 
						||
                                 else overrides['tagger']
 | 
						||
                self.parser    = DependencyParser(self.vocab, path=path, **D.parser) \
 | 
						||
                                 if 'parser' not in overrides \
 | 
						||
                                 else overrides['parser']
 | 
						||
                self.entity    = EntityRecognizer(self.vocab, path=path, **D.entity) \
 | 
						||
                                 if 'entity' not in overrides \
 | 
						||
                                 else overrides['entity']
 | 
						||
                self.matcher   = Matcher(self.vocab, path=path, **D.matcher) \
 | 
						||
                                 if 'matcher' not in overrides \
 | 
						||
                                 else overrides['matcher']
 | 
						||
 | 
						||
                if 'make_doc' in overrides:
 | 
						||
                    self.make_doc = overrides['make_doc']
 | 
						||
                elif 'create_make_doc' in overrides:
 | 
						||
                    self.make_doc = overrides['create_make_doc'](self)
 | 
						||
                else:
 | 
						||
                    self.make_doc = lambda text: self.tokenizer(text)
 | 
						||
                if 'pipeline' in overrides:
 | 
						||
                    self.pipeline = overrides['pipeline']
 | 
						||
                elif 'create_pipeline' in overrides:
 | 
						||
                    self.pipeline = overrides['create_pipeline'](self)
 | 
						||
                else:
 | 
						||
                    self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
 | 
						||
 | 
						||
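        p.
            As a minimal sketch of the #[code create_pipeline] hook shown above
            (the #[code my_pipeline] name is only an example), any callable that
            takes the #[code nlp] object and returns a list of processes can be
            passed:

        +code("python", "Custom pipeline sketch").
            def my_pipeline(nlp):
                # run only the tagger and entity recognizer, skipping the parser
                return [nlp.tagger, nlp.entity]

            nlp = English(create_pipeline=my_pipeline)
            doc = nlp(u'A text processed without the parser.')
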
        +section("language-call")
 | 
						||
            +h(3, "language-call")
 | 
						||
                | #[+tag method] Language.__call__
 | 
						||
 | 
						||
            p
 | 
						||
                | The main entry point to spaCy. Takes raw unicode text, and returns
 | 
						||
                | a #[code Doc] object, which can be iterated to access #[code Token]
 | 
						||
                | and #[code Span] objects.
 | 
						||
 | 
						||
                +aside("Efficiency").
 | 
						||
                    spaCy's algorithms are all linear-time, so you can supply
 | 
						||
                    documents of arbitrary length, e.g. whole novels.
 | 
						||
 | 
						||
            +table(["Example", "Description"], "code")
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.')]
 | 
						||
                    +cell Apply the full pipeline.
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.', parse=False)]
 | 
						||
                    +cell Applies tagger and entity, not parser
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.', entity=False)]
 | 
						||
                    +cell Applies tagger and parser, not entity.
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'Some text.', tag=False)]
 | 
						||
                    +cell Does not apply tagger, entity or parser
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(u'')]
 | 
						||
                    +cell Zero-length tokens, not an error
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(b'Some text')]
 | 
						||
                    +cell Error: need unicode
 | 
						||
                +row
 | 
						||
                    +cell #[ doc = nlp(b'Some text'.decode('utf8'))]
 | 
						||
                    +cell Decode bytes into unicode first.
 | 
						||
 | 
						||
            +code("python", "Definition").
 | 
						||
                def __call__(self, text, tag=True, parse=True, entity=True, matcher=True):
 | 
						||
                    return self
 | 
						||
 | 
						||
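            p.
                For instance (a sketch, assuming an #[code English] instance named
                #[code nlp]), iterating the returned #[code Doc] yields #[code Token]
                objects, while #[code doc.sents] yields sentence #[code Span] objects:

            +code("python", "Usage sketch").
                doc = nlp(u'London is a big city in the United Kingdom.')
                token = doc[0]                    # a Token
                print(token.text, token.tag_)
                for sent in doc.sents:            # each sentence is a Span
                    print(sent.text)
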
            +table(["Name", "Type", "Description"])
 | 
						||
                +row
 | 
						||
                    +cell text
 | 
						||
                    +cell #[+a(link_unicode) unicode]
 | 
						||
                    +cell.
 | 
						||
                        The text to be processed. spaCy expects raw unicode text
 | 
						||
                        – you don"t necessarily need to, say, split it into paragraphs.
 | 
						||
                        However, depending on your documents, you might be better
 | 
						||
                        off applying custom pre-processing. Non-text formatting,
 | 
						||
                        e.g. from HTML mark-up, should be removed before sending
 | 
						||
                        the document to spaCy. If your documents have a consistent
 | 
						||
                        format, you may be able to improve accuracy by pre-processing.
 | 
						||
                        For instance, if the first word of your documents are always
 | 
						||
                        in upper-case, it may be helpful to normalize them before
 | 
						||
                        supplying them to spaCy.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell tag
 | 
						||
                    +cell #[+a(link_bool) bool]
 | 
						||
                    +cell.
 | 
						||
                        Whether to apply the part-of-speech tagger. Required for
 | 
						||
                        parsing and entity recognition.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell parse
 | 
						||
                    +cell #[+a(link_bool) bool]
 | 
						||
                    +cell.
 | 
						||
                        Whether to apply the syntactic dependency parser.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell entity
 | 
						||
                    +cell #[+a(link_bool) bool]
 | 
						||
                    +cell.
 | 
						||
                        Whether to apply the named entity recognizer.
 | 
						||
 | 
						||
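            p.
                As a rough illustration of that kind of pre-processing (the
                #[code strip_markup] helper here is hypothetical, not part of spaCy):

            +code("python", "Pre-processing sketch").
                import re

                def strip_markup(text):
                    # naive clean-up: replace anything that looks like an HTML tag
                    return re.sub(r'<[^>]+>', u' ', text)

                doc = nlp(strip_markup(u'<p>Some <b>marked-up</b> text.</p>'))
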
        +section("english-pipe")
 | 
						||
            +h(3, "english-pipe")
 | 
						||
                | #[+tag method] English.pipe
 | 
						||
 | 
						||
            p
 | 
						||
                | Parse a sequence of texts into a sequence of #[code Doc] objects.
 | 
						||
                | Accepts a generator as input, and produces a generator as output.
 | 
						||
                | Internally, it accumulates a buffer of #[code batch_size]
 | 
						||
                | texts, works on them with #[code n_threads] workers in parallel,
 | 
						||
                | and then yields the #[code Doc] objects one by one.
 | 
						||
 | 
						||
                +aside("Efficiency").
 | 
						||
                    spaCy releases the global interpreter lock around the parser and
 | 
						||
                    named entity recognizer, allowing shared-memory parallelism via
 | 
						||
                    OpenMP. However, OpenMP is not supported on OSX — so multiple
 | 
						||
                    threads will only be used on Linux and Windows.
 | 
						||
 | 
						||
            +table(["Example", "Description"], "usage")
 | 
						||
                +row
 | 
						||
                    +cell #[+a("https://github.com/" + SOCIAL.github + "/spaCy/blob/master/examples/parallel_parse.py") parallel_parse.py]
 | 
						||
                    +cell Parse comments from Reddit in parallel.
 | 
						||
 | 
						||
            +code("python", "Definition").
 | 
						||
                def pipe(self, texts, n_threads=2, batch_size=1000):
 | 
						||
                    yield Doc()
 | 
						||
 | 
						||
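            p.
                A rough usage sketch (the #[code read_texts] helper and the file
                name are assumptions for illustration, not part of spaCy):

            +code("python", "Usage sketch").
                import io

                def read_texts(path):
                    # stream unicode texts from disk, one per line
                    with io.open(path, encoding='utf8') as file_:
                        for line in file_:
                            yield line.strip()

                for doc in nlp.pipe(read_texts('comments.txt'), batch_size=1000, n_threads=4):
                    print(len(doc))
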
            +table(["Arg", "Type", "Description"])
 | 
						||
                +row
 | 
						||
                    +cell texts
 | 
						||
                    +cell
 | 
						||
                    +cell.
 | 
						||
                        A sequence of unicode objects. Usually you will want this
 | 
						||
                        to be a generator, so that you don"t need to have all of
 | 
						||
                        your texts in memory.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell n_threads
 | 
						||
                    +cell #[+a(link_int) int]
 | 
						||
                    +cell.
 | 
						||
                        The number of worker threads to use. If -1, OpenMP will
 | 
						||
                        decide how many to use at run time. Default is 2.
 | 
						||
 | 
						||
                +row
 | 
						||
                    +cell batch_size
 | 
						||
                    +cell #[+a(link_int) int]
 | 
						||
                    +cell.
 | 
						||
                        The number of texts to buffer. Let"s say you have a
 | 
						||
                        #[code batch_size] of 1,000. The input, #[code texts], is
 | 
						||
                        a generator that yields the texts one-by-one. We want to
 | 
						||
                        operate on them in parallel. So, we accumulate a work queue.
 | 
						||
                        Instead of taking one document from #[code texts] and
 | 
						||
                        operating on it, we buffer #[code batch_size] documents,
 | 
						||
                        work on them in parallel, and then yield them one-by-one.
 | 
						||
                        Higher #[code batch_size] therefore often results in better
 | 
						||
                        parallelism, up to a point.
 |