mirror of https://github.com/explosion/spaCy.git
Update usage and 101 docs
parent 6d76c1ea16
commit 286c3d0719
@@ -80,7 +80,7 @@
     },

     "customizing-tokenizer": {
-        "title": "Customizing the tokenizer",
+        "title": "Customising the tokenizer",
         "next": "rule-based-matching"
     },

@@ -48,3 +48,13 @@ p
 +cell ner
 +cell #[+api("entityrecognizer") #[code EntityRecognizer]]
 +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
+
+p
+    | The processing pipeline always #[strong depends on the statistical model]
+    | and its capabilities. For example, a pipeline can only include an entity
+    | recognizer component if the model includes data to make predictions of
+    | entity labels. This is why each model will specify the pipeline to use
+    | in its meta data, as a simple list containing the component names:
+
++code(false, "json").
+    "pipeline": ["vectorizer", "tagger", "parser", "ner"]
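As a rough sketch (not part of the commit itself) of how the pipeline declared in a model's meta data can be inspected at runtime, assuming an installed 'en' model that exposes the usual meta and pipeline attributes:

    import spacy

    nlp = spacy.load('en')        # assumes the 'en' model package is installed
    print(nlp.meta['pipeline'])   # e.g. ['vectorizer', 'tagger', 'parser', 'ner']
    print(nlp.pipeline)           # the component objects created from that list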
@@ -34,7 +34,35 @@ p
 +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
 +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)
 
+p
+    | For example, if you've processed a very large document, you can use
+    | #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
+    | local machine. This will save the document and its tokens, as well as
+    | the vocabulary associated with the #[code Doc].
+
++aside("Why saving the vocab?")
+    | Saving the vocabulary with the #[code Doc] is important, because the
+    | #[code Vocab] holds the context-independent information about the words,
+    | tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
+    | wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
+    | those IDs – for example, the word text or the dependency labels. You
+    | might be saving #[code 446] for "whale", but in a different vocabulary,
+    | this ID could map to "VERB". Similarly, if your document was processed by
+    | a German model, its vocab will include the specific
+    | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+
 +code.
     moby_dick = open('moby_dick.txt', 'r').read() # open a large document
     doc = nlp(moby_dick) # process it
     doc.to_disk('/moby_dick.bin') # save the processed Doc
+
+p
+    | If you need it again later, you can load it back into an empty #[code Doc]
+    | with an empty #[code Vocab] by calling
+    | #[+api("doc#from_disk") #[code from_disk()]]:
+
++code.
+    from spacy.tokens import Doc # to create empty Doc
+    from spacy.vocab import Vocab # to create empty Vocab
+
+    doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
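To make the aside about integer IDs concrete, here is a minimal sketch (not from the commit) of how a string and its ID are resolved through the vocabulary's string store; the actual ID value is whatever the Vocab assigns, not literally 446:

    import spacy

    nlp = spacy.load('en')                          # assumes the 'en' model is installed
    whale_id = nlp.vocab.strings[u'whale']          # text -> integer ID
    assert nlp.vocab.strings[whale_id] == u'whale'  # the same Vocab maps the ID back to text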
@@ -322,8 +322,9 @@ p
     | If you don't need a particular component of the pipeline – for
     | example, the tagger or the parser, you can disable loading it. This can
     | sometimes make a big difference and improve loading speed. Disabled
-    | component names can be provided to #[code spacy.load], #[code from_disk]
-    | or the #[code nlp] object itself as a list:
+    | component names can be provided to #[+api("spacy#load") #[code spacy.load]],
+    | #[+api("language#from_disk") #[code Language.from_disk]] or the
+    | #[code nlp] object itself as a list:

 +code.
     nlp = spacy.load('en', disable=['parser', 'tagger'])
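A rough sketch (not part of the commit) of the three call sites the paragraph mentions, assuming a spaCy v2-style layout where each accepts a disable keyword; the import path and '/model' directory are placeholders:

    import spacy
    from spacy.lang.en import English

    # when loading a model package
    nlp = spacy.load('en', disable=['parser', 'tagger'])

    # when loading a model from a directory on disk
    nlp = English().from_disk('/model', disable=['tagger'])

    # for a single call on the nlp object
    doc = nlp(u"I only need the tokenizer here", disable=['parser'])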
@@ -35,7 +35,7 @@ p
     assert doc[0].text == u'Peach'
     assert doc[1].text == u'emoji'
     assert doc[-1].text == u'🍑'
-    assert doc[17:19] == u'outranking eggplant'
+    assert doc[17:19].text == u'outranking eggplant'
     assert list(doc.noun_chunks)[0].text == u'Peach emoji'

     sentences = list(doc.sents)
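For context on the corrected assertion, slicing a Doc yields a Span object rather than a plain string, so the comparison has to go through its .text attribute; a small sketch (the sentence is made up, not from the docs):

    import spacy

    nlp = spacy.load('en')              # assumes the 'en' model is installed
    doc = nlp(u'Peach emoji is where it has always been.')

    span = doc[0:2]                     # a slice of a Doc is a Span, not a str
    assert span.text == u'Peach emoji'  # compare the Span's text, not the Span itself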
@@ -91,17 +91,35 @@ p
 
 include _spacy-101/_tokenization
 
++infobox
+    | To learn more about how spaCy's tokenizer and its rules work in detail,
+    | how to #[strong customise] it and how to #[strong add your own tokenizer]
+    | to a processing pipeline, see the usage guide on
+    | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
+
 +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
 +tag-model("dependency parse")
 
 include _spacy-101/_pos-deps
 
++infobox
+    | To learn more about #[strong part-of-speech tagging] and rule-based
+    | morphology, and how to #[strong navigate and use the parse tree]
+    | effectively, see the usage guides on
+    | #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
+    | #[+a("/docs/usage/dependency-parse") using the dependency parse].
+
 +h(3, "annotations-ner") Named Entities
 +tag-model("named entities")
 
 include _spacy-101/_named-entities
 
++infobox
+    | To learn more about entity recognition in spaCy, how to
+    | #[strong add your own entities] to a document and how to train and update
+    | the entity predictions of a model, see the usage guide on
+    | #[+a("/docs/usage/entity-recognition") named entity recognition].
+
 +h(2, "vectors-similarity") Word vectors and similarity
 +tag-model("vectors")
 
@@ -109,10 +127,22 @@ include _spacy-101/_similarity
 
 include _spacy-101/_word-vectors
 
++infobox
+    | To learn more about word vectors, how to #[strong customise them] and
+    | how to load #[strong your own vectors] into spaCy, see the usage
+    | guide on
+    | #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
+
 +h(2, "pipelines") Pipelines
 
 include _spacy-101/_pipelines
 
++infobox
+    | To learn more about #[strong how processing pipelines work] in detail,
+    | how to enable and disable their components, and how to
+    | #[strong create your own], see the usage guide on
+    | #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
+
 +h(2, "vocab-stringstore") Vocab, lexemes and the string store
 
 include _spacy-101/_vocab-stringstore
@@ -121,6 +151,11 @@ include _spacy-101/_vocab-stringstore
 
 include _spacy-101/_serialization
 
++infobox
+    | To learn more about #[strong serialization] and how to
+    | #[strong save and load your own models], see the usage guide on
+    | #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
+
 +h(2, "training") Training
 
 include _spacy-101/_training
@@ -23,7 +23,6 @@ p
 include _spacy-101/_similarity
 include _spacy-101/_word-vectors
 
-
 +h(2, "custom") Customising word vectors
 
 p
@@ -31,33 +30,9 @@ p
     | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
     | #[+api("doc#vector") #[code Doc.vector]] and
     | #[+api("span#vector") #[code Span.vector]] return an average of the
-    | vectors of their tokens.
-
-p
-    | You can customize these
+    | vectors of their tokens. You can customize these
     | behaviours by modifying the #[code doc.user_hooks],
     | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
     | dictionaries.
 
-+code("Example").
-    # TODO
-
-p
-    | You can load new word vectors from a file-like buffer using the
-    | #[code vocab.load_vectors()] method. The file should be a
-    | whitespace-delimited text file, where the word is in the first column,
-    | and subsequent columns provide the vector data. For faster loading, you
-    | can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
-    | path to a binary file written by #[code vocab.dump_vectors()].
-
-+code("Example").
-    # TODO
-
-p
-    | You can also load vectors from memory by writing to the
-    | #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
-    | you are writing are of different dimensionality
-    | from the ones currently loaded, you should first call
-    | #[code vocab.resize_vectors(new_size)].
-
 +h(2, "similarity") Similarity
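As a rough sketch (not from the commit) of the user_hooks customization this hunk keeps in place, the example below overrides how Doc.vector is computed; the averaging function is a made-up stand-in, not spaCy's own implementation, and it assumes a model with word vectors:

    import numpy
    import spacy

    nlp = spacy.load('en')                     # assumes a model with word vectors
    doc = nlp(u'Peaches and eggplants')

    def average_vector(doc):
        # made-up hook: average the token vectors manually
        return numpy.mean([token.vector for token in doc], axis=0)

    doc.user_hooks['vector'] = average_vector  # Doc.vector now calls the custom hook
    print(doc.vector)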