From 286c3d0719e28110f4d27b75a44f87d20ed00de4 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 26 May 2017 12:46:29 +0200
Subject: [PATCH] Update usage and 101 docs

---
 website/docs/usage/_data.json                 |  2 +-
 website/docs/usage/_spacy-101/_pipelines.jade | 10 ++++++
 .../docs/usage/_spacy-101/_serialization.jade | 28 +++++++++++++++
 .../usage/language-processing-pipeline.jade   |  5 +--
 website/docs/usage/lightning-tour.jade        |  2 +-
 website/docs/usage/spacy-101.jade             | 35 +++++++++++++++++++
 .../docs/usage/word-vectors-similarities.jade | 27 +-------------
 7 files changed, 79 insertions(+), 30 deletions(-)

diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json
index a611151b3..59057b0bb 100644
--- a/website/docs/usage/_data.json
+++ b/website/docs/usage/_data.json
@@ -80,7 +80,7 @@
     },

     "customizing-tokenizer": {
-        "title": "Customizing the tokenizer",
+        "title": "Customising the tokenizer",
         "next": "rule-based-matching"
     },

diff --git a/website/docs/usage/_spacy-101/_pipelines.jade b/website/docs/usage/_spacy-101/_pipelines.jade
index db095ef04..edf553805 100644
--- a/website/docs/usage/_spacy-101/_pipelines.jade
+++ b/website/docs/usage/_spacy-101/_pipelines.jade
@@ -48,3 +48,13 @@ p
     +cell ner
     +cell #[+api("entityrecognizer") #[code EntityRecognizer]]
     +cell #[code Doc.ents], #[code Doc[i].ent_iob], #[code Doc[i].ent_type]
+
+p
+    | The processing pipeline always #[strong depends on the statistical model]
+    | and its capabilities. For example, a pipeline can only include an entity
+    | recognizer component if the model includes data to make predictions of
+    | entity labels. This is why each model will specify the pipeline to use
+    | in its meta data, as a simple list containing the component names:
+
++code(false, "json").
+    "pipeline": ["vectorizer", "tagger", "parser", "ner"]
diff --git a/website/docs/usage/_spacy-101/_serialization.jade b/website/docs/usage/_spacy-101/_serialization.jade
index f3926dd9c..35d931634 100644
--- a/website/docs/usage/_spacy-101/_serialization.jade
+++ b/website/docs/usage/_spacy-101/_serialization.jade
@@ -34,7 +34,35 @@ p
     +annotation-row(["to_disk", "-", "nlp.to_disk('/path')"], style)
     +annotation-row(["from_disk", "object", "nlp.from_disk('/path')"], style)

+p
+    | For example, if you've processed a very large document, you can use
+    | #[+api("doc#to_disk") #[code Doc.to_disk]] to save it to a file on your
+    | local machine. This will save the document and its tokens, as well as
+    | the vocabulary associated with the #[code Doc].
+
++aside("Why save the vocab?")
+    | Saving the vocabulary with the #[code Doc] is important, because the
+    | #[code Vocab] holds the context-independent information about the words,
+    | tags and labels, and their #[strong integer IDs]. If the #[code Vocab]
+    | wasn't saved with the #[code Doc], spaCy wouldn't know how to resolve
+    | those IDs – for example, the word text or the dependency labels. You
+    | might be saving #[code 446] for "whale", but in a different vocabulary,
+    | this ID could map to "VERB". Similarly, if your document was processed by
+    | a German model, its vocab will include the specific
+    | #[+a("/docs/api/annotation#dependency-parsing-german") German dependency labels].
+
 +code.
     moby_dick = open('moby_dick.txt', 'r').read() # open and read a large document
     doc = nlp(moby_dick)                          # process it
     doc.to_disk('/moby_dick.bin')                 # save the processed Doc
+
+p
+    | If you need it again later, you can load it back into an empty #[code Doc]
+    | with an empty #[code Vocab] by calling
+    | #[+api("doc#from_disk") #[code from_disk()]]:
+
++code.
+    from spacy.tokens import Doc # to create empty Doc
+    from spacy.vocab import Vocab # to create empty Vocab
+
+    doc = Doc(Vocab()).from_disk('/moby_dick.bin') # load processed Doc
diff --git a/website/docs/usage/language-processing-pipeline.jade b/website/docs/usage/language-processing-pipeline.jade
index 948212d82..ce23a1666 100644
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@@ -322,8 +322,9 @@ p
     | If you don't need a particular component of the pipeline – for
     | example, the tagger or the parser – you can disable loading it. This can
     | sometimes make a big difference and improve loading speed. Disabled
-    | component names can be provided to #[code spacy.load], #[code from_disk]
-    | or the #[code nlp] object itself as a list:
+    | component names can be provided to #[+api("spacy#load") #[code spacy.load]],
+    | #[+api("language#from_disk") #[code Language.from_disk]] or the
+    | #[code nlp] object itself as a list:

 +code.
     nlp = spacy.load('en', disable=['parser', 'tagger'])
diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade
index 473f10c5e..4a9a2315f 100644
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@@ -35,7 +35,7 @@ p
     assert doc[0].text == u'Peach'
     assert doc[1].text == u'emoji'
     assert doc[-1].text == u'🍑'
-    assert doc[17:19] == u'outranking eggplant'
+    assert doc[17:19].text == u'outranking eggplant'
     assert list(doc.noun_chunks)[0].text == u'Peach emoji'

     sentences = list(doc.sents)
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
index cdeeac8bf..24690af57 100644
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@@ -91,17 +91,35 @@ p

 include _spacy-101/_tokenization

++infobox
+    | To learn more about how spaCy's tokenizer and its rules work in detail,
+    | how to #[strong customise] it and how to #[strong add your own tokenizer]
+    | to a processing pipeline, see the usage guide on
+    | #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].

 +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
     +tag-model("dependency parse")

 include _spacy-101/_pos-deps

++infobox
+    | To learn more about #[strong part-of-speech tagging] and rule-based
+    | morphology, and how to #[strong navigate and use the parse tree]
+    | effectively, see the usage guides on
+    | #[+a("/docs/usage/pos-tagging") part-of-speech tagging] and
+    | #[+a("/docs/usage/dependency-parse") using the dependency parse].
+
 +h(3, "annotations-ner") Named Entities
     +tag-model("named entities")

 include _spacy-101/_named-entities

++infobox
+    | To learn more about entity recognition in spaCy, how to
+    | #[strong add your own entities] to a document and how to train and update
+    | the entity predictions of a model, see the usage guide on
+    | #[+a("/docs/usage/entity-recognition") named entity recognition].
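In practice, "adding your own entities" means creating a #[code Span] over the tokens in question and writing it to #[code doc.ents]. A minimal sketch, assuming an #[code nlp] object loaded from an English model as in the examples above; the sentence and the #[code ORG] label are only illustrative and not part of this patch:

+code.
    from spacy.tokens import Span

    doc = nlp(u'Netflix is hiring a new VP of global policy')
    # look up the ORG label's ID in the string store and create a Span for token 0
    netflix = Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])
    doc.ents = [netflix]
    assert [(ent.text, ent.label_) for ent in doc.ents] == [(u'Netflix', u'ORG')]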
+
 +h(2, "vectors-similarity") Word vectors and similarity
     +tag-model("vectors")

@@ -109,10 +127,22 @@ include _spacy-101/_similarity

 include _spacy-101/_word-vectors

++infobox
+    | To learn more about word vectors, how to #[strong customise them] and
+    | how to load #[strong your own vectors] into spaCy, see the usage
+    | guide on
+    | #[+a("/docs/usage/word-vectors-similarities") using word vectors and semantic similarities].
+
 +h(2, "pipelines") Pipelines

 include _spacy-101/_pipelines

++infobox
+    | To learn more about #[strong how processing pipelines work] in detail,
+    | how to enable and disable their components, and how to
+    | #[strong create your own], see the usage guide on
+    | #[+a("/docs/usage/language-processing-pipeline") language processing pipelines].
+
 +h(2, "vocab-stringstore") Vocab, lexemes and the string store

 include _spacy-101/_vocab-stringstore

@@ -121,6 +151,11 @@ include _spacy-101/_vocab-stringstore

 include _spacy-101/_serialization

++infobox
+    | To learn more about #[strong serialization] and how to
+    | #[strong save and load your own models], see the usage guide on
+    | #[+a("/docs/usage/saving-loading") saving, loading and data serialization].
+
 +h(2, "training") Training

 include _spacy-101/_training
diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade
index 00e200f59..eecb268b6 100644
--- a/website/docs/usage/word-vectors-similarities.jade
+++ b/website/docs/usage/word-vectors-similarities.jade
@@ -23,7 +23,6 @@ p
 include _spacy-101/_similarity

 include _spacy-101/_word-vectors
-
 +h(2, "custom") Customising word vectors

 p
     | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
     | #[+api("doc#vector") #[code Doc.vector]] and
     | #[+api("span#vector") #[code Span.vector]] return an average of the
-    | vectors of their tokens.
-
-p
-    | You can customize these
+    | vectors of their tokens. You can customize these
     | behaviours by modifying the #[code doc.user_hooks],
     | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
     | dictionaries.

-+code("Example").
-    # TODO
-
-p
-    | You can load new word vectors from a file-like buffer using the
-    | #[code vocab.load_vectors()] method. The file should be a
-    | whitespace-delimited text file, where the word is in the first column,
-    | and subsequent columns provide the vector data. For faster loading, you
-    | can use the #[code vocab.vectors_from_bin_loc()] method, which accepts a
-    | path to a binary file written by #[code vocab.dump_vectors()].
-
-+code("Example").
-    # TODO
-
-p
-    | You can also load vectors from memory by writing to the
-    | #[+api("lexeme#vector") #[code Lexeme.vector]] property. If the vectors
-    | you are writing are of different dimensionality
-    | from the ones currently loaded, you should first call
-    | #[code vocab.resize_vectors(new_size)].
-
 +h(2, "similarity") Similarity
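The paragraph on customising word vectors explains that #[code Doc.vector] and #[code Span.vector] can be overridden through the #[code doc.user_hooks] and #[code doc.user_span_hooks] dictionaries, while the patch drops the #[code # TODO] example that used to follow it. A minimal sketch of such a hook, assuming an English model with word vectors is installed; the stop-word filtering rule is purely illustrative:

+code.
    import numpy
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I like green apples')

    def filtered_vector(obj):
        # average only the vectors of non-stop-word tokens, falling back to all tokens
        vectors = [token.vector for token in obj if not token.is_stop]
        if not vectors:
            vectors = [token.vector for token in obj]
        return numpy.mean(vectors, axis=0)

    doc.user_hooks['vector'] = filtered_vector       # now used by doc.vector
    doc.user_span_hooks['vector'] = filtered_vector  # now used by span.vector

Because the hooks are stored on each individual #[code Doc], a custom pipeline component is a natural place to assign them, so every processed document gets the same behaviour.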