mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
e6dd01fc90
|
@ -18,7 +18,7 @@
|
||||||
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
|
<path fill="#f8cecc" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M176 58h103.3L296 88l-16.8 30H176l16.8-30z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
|
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="58" height="14" transform="translate(206.5 80.5)">tokenizer</text>
|
||||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
|
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M314 58h103.3L434 88l-16.8 30H314l16.8-30z"/>
|
||||||
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">vectorizer</text>
|
<text class="svg__pipeline__text-small" dy="0.75em" dx="-0.25em" width="62" height="14" transform="translate(342.5 80.5)">tensorizer</text>
|
||||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
|
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M296.5 88.2h24.7"/>
|
||||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
|
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M327.2 88.2l-8 4 2-4-2-4z"/>
|
||||||
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
|
<path fill="#ffe6cc" stroke="#d79b00" stroke-width="2" stroke-miterlimit="10" d="M416 58h103.3L536 88l-16.8 30H416l16.8-30z"/>
|
||||||
|
|
Before Width: | Height: | Size: 3.2 KiB After Width: | Height: | Size: 3.2 KiB |
|
@ -6,7 +6,7 @@ p
|
||||||
| different steps – this is also referred to as the
|
| different steps – this is also referred to as the
|
||||||
| #[strong processing pipeline]. The pipeline used by the
|
| #[strong processing pipeline]. The pipeline used by the
|
||||||
| #[+a("/docs/usage/models") default models] consists of a
|
| #[+a("/docs/usage/models") default models] consists of a
|
||||||
| vectorizer, a tagger, a parser and an entity recognizer. Each pipeline
|
| tensorizer, a tagger, a parser and an entity recognizer. Each pipeline
|
||||||
| component returns the processed #[code Doc], which is then passed on to
|
| component returns the processed #[code Doc], which is then passed on to
|
||||||
| the next component.
|
| the next component.
|
||||||
|
|
||||||
|
@ -21,21 +21,24 @@ p
|
||||||
| #[strong Creates:] Objects, attributes and properties modified and set by
|
| #[strong Creates:] Objects, attributes and properties modified and set by
|
||||||
| the component.
|
| the component.
|
||||||
|
|
||||||
+table(["Name", "Component", "Creates"])
|
+table(["Name", "Component", "Creates", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell tokenizer
|
+cell tokenizer
|
||||||
+cell #[+api("tokenizer") #[code Tokenizer]]
|
+cell #[+api("tokenizer") #[code Tokenizer]]
|
||||||
+cell #[code Doc]
|
+cell #[code Doc]
|
||||||
|
+cell Segment text into tokens.
|
||||||
|
|
||||||
+row("divider")
|
+row("divider")
|
||||||
+cell vectorizer
|
+cell tensorizer
|
||||||
+cell #[code Vectorizer]
|
+cell #[code TokenVectorEncoder]
|
||||||
+cell #[code Doc.tensor]
|
+cell #[code Doc.tensor]
|
||||||
|
+cell Create feature representation tensor for #[code Doc].
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell tagger
|
+cell tagger
|
||||||
+cell #[+api("tagger") #[code Tagger]]
|
+cell #[+api("tagger") #[code Tagger]]
|
||||||
+cell #[code Doc[i].tag]
|
+cell #[code Doc[i].tag]
|
||||||
|
+cell Assign part-of-speech tags.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell parser
|
+cell parser
|
||||||
|
@ -43,11 +46,13 @@ p
|
||||||
+cell
|
+cell
|
||||||
| #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents],
|
| #[code Doc[i].head], #[code Doc[i].dep], #[code Doc.sents],
|
||||||
| #[code Doc.noun_chunks]
|
| #[code Doc.noun_chunks]
|
||||||
|
+cell Assign dependency labels.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell ner
|
+cell ner
|
||||||
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
|
+cell #[+api("entityrecognizer") #[code EntityRecognizer]]
|
||||||
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
|
+cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
|
||||||
|
+cell Detect and label named entities.
|
||||||
|
|
||||||
p
|
p
|
||||||
| The processing pipeline always #[strong depends on the statistical model]
|
| The processing pipeline always #[strong depends on the statistical model]
|
||||||
|
@ -57,4 +62,4 @@ p
|
||||||
| in its meta data, as a simple list containing the component names:
|
| in its meta data, as a simple list containing the component names:
|
||||||
|
|
||||||
+code(false, "json").
|
+code(false, "json").
|
||||||
"pipeline": ["vectorizer", "tagger", "parser", "ner"]
|
"pipeline": ["tensorizer", "tagger", "parser", "ner"]
|
||||||
|
|
|
@ -102,8 +102,8 @@ p
|
||||||
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
|
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
|
||||||
|
|
||||||
p
|
p
|
||||||
| If the doc's vocabulary doesn't contain a hash for "coffee", spaCy will
|
| If the vocabulary doesn't contain a hash for "coffee", spaCy will
|
||||||
| throw an error. So you either need to add it manually, or initialise the
|
| throw an error. So you either need to add it manually, or initialise the
|
||||||
| new #[code Doc] with the shared vocab. To prevent this problem, spaCy
|
| new #[code Doc] with the shared vocabulary. To prevent this problem,
|
||||||
| will ususally export the vocab when you save a #[code Doc] or #[code nlp]
|
| spaCy will also export the #[code Vocab] when you save a
|
||||||
| object.
|
| #[code Doc] or #[code nlp] object.
|
||||||
|
|
|
@ -10,7 +10,7 @@ include _spacy-101/_pipelines
|
||||||
|
|
||||||
p
|
p
|
||||||
| spaCy makes it very easy to create your own pipelines consisting of
|
| spaCy makes it very easy to create your own pipelines consisting of
|
||||||
| reusable components – this includes spaCy's default vectorizer, tagger,
|
| reusable components – this includes spaCy's default tensorizer, tagger,
|
||||||
| parser and entity recognizer, but also your own custom processing
|
| parser and entity recognizer, but also your own custom processing
|
||||||
| functions. A pipeline component can be added to an already existing
|
| functions. A pipeline component can be added to an already existing
|
||||||
| #[code nlp] object, specified when initialising a #[code Language] class,
|
| #[code nlp] object, specified when initialising a #[code Language] class,
|
||||||
|
@ -56,7 +56,7 @@ p
|
||||||
|
|
||||||
p
|
p
|
||||||
| ... the model tells spaCy to use the pipeline
|
| ... the model tells spaCy to use the pipeline
|
||||||
| #[code ["vectorizer", "tagger", "parser", "ner"]]. spaCy will then look
|
| #[code ["tensorizer", "tagger", "parser", "ner"]]. spaCy will then look
|
||||||
| up each string in its internal factories registry and initialise the
|
| up each string in its internal factories registry and initialise the
|
||||||
| individual components. It'll then load #[code spacy.lang.en.English],
|
| individual components. It'll then load #[code spacy.lang.en.English],
|
||||||
| pass it the path to the model's data directory, and return it for you
|
| pass it the path to the model's data directory, and return it for you
|
||||||
|
@ -230,7 +230,7 @@ p
|
||||||
p
|
p
|
||||||
| Let's say you have trained your own document sentiment model on English
|
| Let's say you have trained your own document sentiment model on English
|
||||||
| text. After tokenization, you want spaCy to first execute the
|
| text. After tokenization, you want spaCy to first execute the
|
||||||
| #[strong default vectorizer], followed by a custom
|
| #[strong default tensorizer], followed by a custom
|
||||||
| #[strong sentiment component] that adds a #[code .sentiment]
|
| #[strong sentiment component] that adds a #[code .sentiment]
|
||||||
| property to the #[code Doc], containing your model's sentiment prediction.
|
| property to the #[code Doc], containing your model's sentiment prediction.
|
||||||
|
|
||||||
|
@ -293,13 +293,13 @@ p
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"spacy_version": ">=2.0.0,<3.0.0",
|
"spacy_version": ">=2.0.0,<3.0.0",
|
||||||
"pipeline": ["vectorizer", "sentiment"]
|
"pipeline": ["tensorizer", "sentiment"]
|
||||||
}
|
}
|
||||||
|
|
||||||
p
|
p
|
||||||
| When you load your new model, spaCy will call the model's #[code load()]
|
| When you load your new model, spaCy will call the model's #[code load()]
|
||||||
| method. This will return a #[code Language] object with a pipeline
|
| method. This will return a #[code Language] object with a pipeline
|
||||||
| containing the default vectorizer, and the sentiment component returned
|
| containing the default tensorizer, and the sentiment component returned
|
||||||
| by your custom #[code "sentiment"] factory.
|
| by your custom #[code "sentiment"] factory.
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
|
@ -324,7 +324,7 @@ p
|
||||||
|
|
||||||
+code.
|
+code.
|
||||||
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
nlp = spacy.load('en', disable=['parser', 'tagger'])
|
||||||
nlp = English().from_disk('/model', disable=['vectorizer', 'ner'])
|
nlp = English().from_disk('/model', disable=['tensorizer', 'ner'])
|
||||||
doc = nlp(u"I don't want parsed", disable=['parser'])
|
doc = nlp(u"I don't want parsed", disable=['parser'])
|
||||||
|
|
||||||
p
|
p
|
||||||
|
|
|
@ -303,9 +303,9 @@ include _spacy-101/_training
|
||||||
p
|
p
|
||||||
| We're very happy to see the spaCy community grow and include a mix of
|
| We're very happy to see the spaCy community grow and include a mix of
|
||||||
| people from all kinds of different backgrounds – computational
|
| people from all kinds of different backgrounds – computational
|
||||||
| linguistics, data science, deep learning and research. If you'd like to
|
| linguistics, data science, deep learning, research and more. If you'd
|
||||||
| get involved, below are some answers to the most important questions and
|
| like to get involved, below are some answers to the most important
|
||||||
| resources for further reading.
|
| questions and resources for further reading.
|
||||||
|
|
||||||
+h(3, "faq-help-code") Help, my code isn't working!
|
+h(3, "faq-help-code") Help, my code isn't working!
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,9 @@ p
|
||||||
| mapping #[strong no longer depends on the vocabulary state], making a lot
|
| mapping #[strong no longer depends on the vocabulary state], making a lot
|
||||||
| of workflows much simpler, especially during training. Unlike integer IDs
|
| of workflows much simpler, especially during training. Unlike integer IDs
|
||||||
| in spaCy v1.x, hash values will #[strong always match] – even across
|
| in spaCy v1.x, hash values will #[strong always match] – even across
|
||||||
| models. Strings can now be added explicitly using the new #[+api("stringstore#add") #[code Stringstore.add]] method.
|
| models. Strings can now be added explicitly using the new
|
||||||
|
| #[+api("stringstore#add") #[code StringStore.add]] method. A token's hash
|
||||||
|
| is available via #[code token.orth].
|
||||||
|
|
||||||
+infobox
|
+infobox
|
||||||
| #[strong API:] #[+api("stringstore") #[code StringStore]]
|
| #[strong API:] #[+api("stringstore") #[code StringStore]]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user