From f122d82f290a95cb972a392c401ea04d163b0930 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 26 May 2017 13:17:48 +0200
Subject: [PATCH] Update usage docs and add "under construction"

---
 website/_includes/_mixins-base.jade           | 11 +++
 website/docs/usage/_spacy-101/_training.jade  |  2 +-
 website/docs/usage/adding-languages.jade      |  5 +-
 website/docs/usage/deep-learning.jade         |  6 +-
 website/docs/usage/production-use.jade        | 30 ++++----
 website/docs/usage/spacy-101.jade             |  4 ++
 website/docs/usage/training-ner.jade          | 70 +++++++++----------
 website/docs/usage/training.jade              | 56 ---------------
 website/docs/usage/visualizers.jade           |  2 +-
 .../docs/usage/word-vectors-similarities.jade |  4 ++
 10 files changed, 78 insertions(+), 112 deletions(-)

diff --git a/website/_includes/_mixins-base.jade b/website/_includes/_mixins-base.jade
index c6132df74..80d63353d 100644
--- a/website/_includes/_mixins-base.jade
+++ b/website/_includes/_mixins-base.jade
@@ -186,3 +186,14 @@ mixin landing-header()
 mixin landing-badge(url, graphic, alt, size)
     +a(url)(aria-label=alt title=alt).c-landing__badge
         +svg("graphics", graphic, size || 225)
+
+
+//- Under construction (temporary)
+    Marks sections that still need to be completed for the v2.0 release.
+
+mixin under-construction()
+    +infobox("🚧 Under construction")
+        | This section is still being written and will be updated for the v2.0
+        | release. Is there anything that you think should definitely be mentioned
+        | or explained here? Any examples you'd like to see? #[strong Let us know]
+        | on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!
diff --git a/website/docs/usage/_spacy-101/_training.jade b/website/docs/usage/_spacy-101/_training.jade
index 59861434c..f4a0c7194 100644
--- a/website/docs/usage/_spacy-101/_training.jade
+++ b/website/docs/usage/_spacy-101/_training.jade
@@ -1,3 +1,3 @@
 //- 💫 DOCS > USAGE > SPACY 101 > TRAINING

-p
++under-construction
diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index ae04aad57..cd1fc4199 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -107,7 +107,6 @@ p
     .u-text-right
         +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic

-
 +table(["File name", "Variables", "Description"])
     +row
         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
@@ -439,7 +438,7 @@ p

 +h(3, "morph-rules") Morph rules

-//- TODO: write morph rules section
++under-construction

 +h(2, "testing") Testing the new language tokenizer
@@ -631,7 +630,7 @@ p
     | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
     | The #[code vectors.bin] file should consist of one word and vector per line.

-+aside-code("your_data_directory", "yaml").
+//-+aside-code("your_data_directory", "yaml").
     ├── vocab/
     |   ├── lexemes.bin
     |   ├── strings.json
diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade
index fec01b4ba..18f33c900 100644
--- a/website/docs/usage/deep-learning.jade
+++ b/website/docs/usage/deep-learning.jade
@@ -17,6 +17,8 @@ p
     | #[+a("http://deeplearning.net/software/theano/") Theano] is also
     | supported.

++under-construction
+
 +code("Runtime usage").
     def count_entity_sentiment(nlp, texts):
         '''Compute the net document sentiment for each entity in the texts.'''
@@ -153,7 +155,9 @@ p
     | adding another LSTM layer, using an attention mechanism, using character
     | features, etc.
-+h(2, "attribute-hooks") Attribute hooks (experimental)
++h(2, "attribute-hooks") Attribute hooks
+
++under-construction

 p
     | Earlier, we saw how to store data in the new generic #[code user_data]
diff --git a/website/docs/usage/production-use.jade b/website/docs/usage/production-use.jade
index c7f872c6d..e9fd4a30f 100644
--- a/website/docs/usage/production-use.jade
+++ b/website/docs/usage/production-use.jade
@@ -2,16 +2,18 @@

 include ../../_includes/_mixins

++under-construction
+
 +h(2, "multithreading") Multi-threading with #[code .pipe()]

 p
     | If you have a sequence of documents to process, you should use the
-    | #[+api("language#pipe") #[code .pipe()]] method. The method takes an
-    | iterator of texts, and accumulates an internal buffer,
+    | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
+    | an iterator of texts, and accumulates an internal buffer,
     | which it works on in parallel. It then yields the documents in order,
     | one-by-one. After a long and bitter struggle, the global interpreter
     | lock was freed around spaCy's main parsing loop in v0.100.3. This means
-    | that the #[code .pipe()] method will be significantly faster in most
+    | that #[code .pipe()] will be significantly faster in most
     | practical situations, because it allows shared memory parallelism.

 +code.

@@ -20,23 +22,27 @@
 p
     | To make full use of the #[code .pipe()] function, you might want to
-    | brush up on Python generators. Here are a few quick hints:
+    | brush up on #[strong Python generators]. Here are a few quick hints:

 +list
     +item
-        | Generator comprehensions can be written
-        | (#[code item for item in sequence])
+        | Generator comprehensions can be written as
+        | #[code (item for item in sequence)].

     +item
-        | The #[code itertools] built-in library and the #[code cytoolz]
-        | package provide a lot of handy generator tools
+        | The
+        | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
+        | and the
+        | #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
+        | provide a lot of handy #[strong generator tools].

     +item
         | Often you'll have an input stream that pairs text with some
-        | important metadata, e.g. a JSON document. To pair up the metadata
-        | with the processed #[code Doc] object, you should use the tee
-        | function to split the generator in two, and then #[code izip] the
-        | extra stream to the document stream.
+        | important meta data, e.g. a JSON document. To
+        | #[strong pair up the meta data] with the processed #[code Doc]
+        | object, you should use the #[code itertools.tee] function to split
+        | the generator in two, and then #[code izip] the extra stream to the
+        | document stream.
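
p
    | For example, here's a minimal sketch of the #[code tee] pattern from the
    | last point above. It assumes #[code records] yields #[code (text, meta)]
    | tuples, that an #[code 'en'] model is installed, and Python 3 (on
    | Python 2, use #[code itertools.izip] instead of #[code zip]):

+code.
    from itertools import tee

    import spacy

    nlp = spacy.load('en')
    records = [(u'An example text.', {'id': 1}),
               (u'Another text.', {'id': 2})]   # assumed input format
    stream1, stream2 = tee(records)             # split the record stream in two
    texts = (text for text, meta in stream1)    # one copy feeds .pipe()
    metas = (meta for text, meta in stream2)    # the other keeps the meta data
    for doc, meta in zip(nlp.pipe(texts), metas):
        print(doc.text, meta['id'])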

 +h(2, "own-annotations") Bringing your own annotations
diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade
index 24690af57..7c6525004 100644
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@@ -4,6 +4,8 @@ include ../../_includes/_mixins

 +h(2, "features") Features

++under-construction
+
 +aside
     | If one of spaCy's functionalities #[strong needs a model], it means that
     | you need to have one of the available
@@ -162,6 +164,8 @@ include _spacy-101/_training

 +h(2, "architecture") Architecture

++under-construction
+
 +image
     include ../../assets/img/docs/architecture.svg
 .u-text-right
diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade
index 8b8789485..4faa47675 100644
--- a/website/docs/usage/training-ner.jade
+++ b/website/docs/usage/training-ner.jade
@@ -64,44 +64,10 @@ p
     | predicts the new category with minimal difference from the previous
     | output.

-+h(2, "saving-loading") Saving and loading
-
-p
-    | After training our model, you'll usually want to save its state, and load
-    | it back later. You can do this with the #[code Language.save_to_directory()]
-    | method:
-
-+code.
-    nlp.save_to_directory('/home/me/data/en_technology')
-
-p
-    | To make the model more convenient to deploy, we recommend wrapping it as
-    | a Python package, so that you can install it via pip and load it as a
-    | module. spaCy comes with a handy #[+api("cli#package") #[code package]]
-    | CLI command to create all required files and directories.
-
-+code(false, "bash").
-    python -m spacy package /home/me/data/en_technology /home/me/my_models
-
-p
-    | To build the package and create a #[code .tar.gz] archive, run
-    | #[code python setup.py sdist] from within its directory.
-
-+infobox("Saving and loading models")
-    | For more information and a detailed guide on how to package your model,
-    | see the documentation on
-    | #[+a("/docs/usage/saving-loading") saving and loading models].
-
-p
-    | After you've generated and installed the package, you'll be able to
-    | load the model as follows:
-
-+code.
-    import en_technology
-    nlp = en_technology.load()
-
 +h(2, "example") Example: Adding and training an #[code ANIMAL] entity

++under-construction
+
 p
     | This script shows how to add a new entity type to an existing pre-trained
     | NER model. To keep the example short and simple, only four sentences are
@@ -170,5 +136,33 @@ p

 p
     | After training your model, you can
-    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
-    | models as Python packages, for ease of deployment.
+    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
+    | wrapping models as Python packages, for ease of deployment.
+
++h(2, "saving-loading") Saving and loading
+
+p
+    | After training your model, you'll usually want to save its state, and load
+    | it back later. You can do this with the
+    | #[+api("language#to_disk") #[code Language.to_disk()]] method:
+
++code.
+    nlp.to_disk('/home/me/data/en_technology')
+
+p
+    | To make the model more convenient to deploy, we recommend wrapping it as
+    | a Python package, so that you can install it via pip and load it as a
+    | module. spaCy comes with a handy #[+api("cli#package") #[code package]]
+    | CLI command to create all required files and directories.
+
++code(false, "bash").
+    python -m spacy package /home/me/data/en_technology /home/me/my_models
+
+p
+    | To build the package and create a #[code .tar.gz] archive, run
+    | #[code python setup.py sdist] from within its directory.
+
++infobox("Saving and loading models")
+    | For more information and a detailed guide on how to package your model,
+    | see the documentation on
+    | #[+a("/docs/usage/saving-loading#models") saving and loading models].
diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade
index 9df71851a..6c6c17e17 100644
--- a/website/docs/usage/training.jade
+++ b/website/docs/usage/training.jade
@@ -81,59 +81,3 @@ p.o-inline-list

 p
     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
-
-+h(2, "feature-templates") Customizing the feature extraction
-
-p
-    | spaCy currently uses linear models for the tagger, parser and entity
-    | recognizer, with weights learned using the
-    | #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
-
-+aside("Linear Model Feature Scheme")
-    | For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
-
-p
-    | Because it's a linear model, it's important for accuracy to build
-    | conjunction features out of the atomic predictors. Let's say you have
-    | two atomic predictors asking, "What is the part-of-speech of the
-    | previous token?", and "What is the part-of-speech of the previous
-    | previous token?". These predictors will introduce a number of features,
-    | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
-    | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
-
-p
-    | The feature extraction proceeds in two passes. In the first pass, we
-    | fill an array with the values of all of the atomic predictors. In the
-    | second pass, we iterate over the feature templates, and fill a small
-    | temporary array with the predictors that will be combined into a
-    | conjunction feature. Finally, we hash this array into a 64-bit integer,
-    | using the MurmurHash algorithm. You can see this at work in the
-    | #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
-
-p
-    | It's very easy to change the feature templates, to create novel
-    | combinations of the existing atomic predictors. There's currently no API
-    | available to add new atomic predictors, though. You'll have to create a
-    | subclass of the model, and write your own #[code set_featuresC] method.
-
-p
-    | The feature templates are passed in using the #[code features] keyword
-    | argument to the constructors of the #[+api("tagger") #[code Tagger]],
-    | #[+api("dependencyparser") #[code DependencyParser]] and
-    | #[+api("entityrecognizer") #[code EntityRecognizer]]:
-
-+code.
-    from spacy.vocab import Vocab
-    from spacy.pipeline import Tagger
-    from spacy.tagger import P2_orth, P1_orth
-    from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
-
-    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
-    tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
-                                     (P2_orth,), (P1_orth,), (W_orth,),
-                                     (N1_orth,), (N2_orth,)])
-
-p
-    | Custom feature templates can be passed to the #[code DependencyParser]
-    | and #[code EntityRecognizer] as well, also using the #[code features]
-    | keyword argument of the constructor.
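
p
    | To round off the packaging workflow above, here's a minimal sketch of
    | loading the packaged model, assuming the #[code en_technology] package
    | from the example has been built and installed (the archive file name is
    | illustrative):

+code.
    # pip install dist/en_technology-0.0.0.tar.gz  (exact name may differ)
    import en_technology

    nlp = en_technology.load()
    doc = nlp(u'This is a sentence about technology.')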
diff --git a/website/docs/usage/visualizers.jade b/website/docs/usage/visualizers.jade
index 90a343700..186fc5db3 100644
--- a/website/docs/usage/visualizers.jade
+++ b/website/docs/usage/visualizers.jade
@@ -334,7 +334,7 @@ p
     | token #[code <script src="malicious-code.js"></script>].
     | Instead of relying on the server to render and sanitize HTML, you
     | can do this on the client in JavaScript. displaCy.js creates
-    | the SVG markup as DOM nodes and will never insert raw HTML.
+    | the markup as DOM nodes and will never insert raw HTML.

 p
     | The #[code parse_deps] function takes a #[code Doc] object and returns
diff --git a/website/docs/usage/word-vectors-similarities.jade b/website/docs/usage/word-vectors-similarities.jade
index eecb268b6..e5935cfb6 100644
--- a/website/docs/usage/word-vectors-similarities.jade
+++ b/website/docs/usage/word-vectors-similarities.jade
@@ -25,6 +25,8 @@ include _spacy-101/_word-vectors

 +h(2, "custom") Customising word vectors

++under-construction
+
 p
     | By default, #[+api("token#vector") #[code Token.vector]] returns the
     | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
@@ -36,3 +38,5 @@ p
     | dictionaries.

 +h(2, "similarity") Similarity
+
++under-construction
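
p
    | For example, here's a minimal sketch of inspecting vectors and comparing
    | similarities, assuming a model with word vectors, such as
    | #[code en_core_web_md], is installed:

+code.
    import spacy

    nlp = spacy.load('en_core_web_md')  # assumption: a model with word vectors
    doc = nlp(u'dog cat banana')
    for token in doc:
        print(token.text, token.has_vector, token.vector_norm)
    print(doc[0].similarity(doc[1]))    # similarity of 'dog' and 'cat'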