//- spaCy/website/docs/usage/_spacy-101/_pipelines.jade

//- 💫 DOCS > USAGE > SPACY 101 > PIPELINES

p
    |  When you call #[code nlp] on a text, spaCy first tokenizes the text to
    |  produce a #[code Doc] object. The #[code Doc] is then processed in several
    |  different steps – this is also referred to as the
    |  #[strong processing pipeline]. The pipeline used by the
    |  #[+a("/docs/usage/models") default models] consists of a
    |  tensorizer, a tagger, a parser and an entity recognizer. Each pipeline
    |  component returns the processed #[code Doc], which is then passed on to
    |  the next component.

+image
    include ../../../assets/img/docs/pipeline.svg
    .u-text-right
        +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic

+aside
    |  #[strong Name:] ID of the pipeline component.#[br]
    |  #[strong Component:] spaCy's implementation of the component.#[br]
    |  #[strong Creates:] Objects, attributes and properties modified and set by
    |  the component.

+table(["Name", "Component", "Creates", "Description"])
    +row
        +cell tokenizer
        +cell #[+api("tokenizer") #[code Tokenizer]]
        +cell #[code Doc]
        +cell Segment text into tokens.

    +row("divider")
        +cell tensorizer
        +cell #[code TokenVectorEncoder]
        +cell #[code Doc.tensor]
        +cell Create feature representation tensor for #[code Doc].

    +row
        +cell tagger
        +cell #[+api("tagger") #[code Tagger]]
        +cell #[code Doc[i].tag]
        +cell Assign part-of-speech tags.

    +row
        +cell parser
        +cell #[+api("dependencyparser") #[code DependencyParser]]
        +cell
            |  #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents],
            |  #[code Doc.noun_chunks]
        +cell Assign dependency labels.

    +row
        +cell ner
        +cell #[+api("entityrecognizer") #[code EntityRecognizer]]
        +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
        +cell Detect and label named entities.

p
    |  The processing pipeline always #[strong depends on the statistical model]
    |  and its capabilities. For example, a pipeline can only include an entity
    |  recognizer component if the model includes data to make predictions of
    |  entity labels. This is why each model will specify the pipeline to use
    |  in its meta data, as a simple list containing the component names:

+code(false, "json").
    "pipeline": ["tensorizer", "tagger", "parser", "ner"]

p
    |  Although you can mix and match pipeline components, their
    |  #[strong order and combination] are usually important. Some components may
    |  require certain modifications to the #[code Doc] to process it. For
    |  example, the default pipeline first applies the tensorizer, which
    |  pre-processes the doc and encodes its internal
    |  #[strong meaning representations] as an array of floats, also called a
    |  #[strong tensor]. This includes the tokens and their context, which is
    |  required for the next component, the tagger, to make predictions of the
    |  part-of-speech tags. Because spaCy's models are neural network models,
    |  they only "speak" tensors and expect the input #[code Doc] to have
    |  a #[code tensor].