Mirror of https://github.com/explosion/spaCy.git

commit f122d82f29 (parent 286c3d0719)
Update usage docs and add "under construction"

@@ -186,3 +186,14 @@ mixin landing-header()
 mixin landing-badge(url, graphic, alt, size)
     +a(url)(aria-label=alt title=alt).c-landing__badge
         +svg("graphics", graphic, size || 225)
+
+
+//- Under construction (temporary)
+    Marks sections that still need to be completed for the v2.0 release.
+
+mixin under-construction()
+    +infobox("🚧 Under construction")
+        | This section is still being written and will be updated for the v2.0
+        | release. Is there anything that you think should definitely be mentioned
+        | or explained here? Any examples you'd like to see? #[strong Let us know]
+        | on the #[+a(gh("spacy") + "/issues") v2.0 alpha thread] on GitHub!

@@ -1,3 +1,3 @@
 //- 💫 DOCS > USAGE > SPACY 101 > TRAINING
 
-p
++under-construction

@@ -107,7 +107,6 @@ p
 .u-text-right
     +button("/assets/img/docs/language_data.svg", false, "secondary").u-text-tag View large graphic
 
-
 +table(["File name", "Variables", "Description"])
     +row
         +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]

@@ -439,7 +438,7 @@ p
 
 +h(3, "morph-rules") Morph rules
 
-//- TODO: write morph rules section
++under-construction
 
 +h(2, "testing") Testing the new language tokenizer
 

@@ -631,7 +630,7 @@ p
     | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim].
     | The #[code vectors.bin] file should consist of one word and vector per line.
 
-+aside-code("your_data_directory", "yaml").
+//-+aside-code("your_data_directory", "yaml").
     ├── vocab/
     |   ├── lexemes.bin
     |   ├── strings.json
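
For illustration, a minimal reader for that vectors.bin format; this is a sketch, not part of the commit, and it assumes whitespace-separated values with the token first, as described above:

    import numpy

    def read_vectors(path):
        # One word and its whitespace-separated vector components per line.
        vectors = {}
        with open(path) as file_:
            for line in file_:
                pieces = line.split()
                vectors[pieces[0]] = numpy.asarray(pieces[1:], dtype='float32')
        return vectors

    word_vectors = read_vectors('vectors.bin')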

@@ -17,6 +17,8 @@ p
     | #[+a("http://deeplearning.net/software/theano/") Theano] is also
     | supported.
 
++under-construction
+
 +code("Runtime usage").
     def count_entity_sentiment(nlp, texts):
         '''Compute the net document sentiment for each entity in the texts.'''

@@ -153,7 +155,9 @@ p
     | adding another LSTM layer, using an attention mechanism, using character
     | features, etc.
 
-+h(2, "attribute-hooks") Attribute hooks (experimental)
++h(2, "attribute-hooks") Attribute hooks
+
++under-construction
 
 p
     | Earlier, we saw how to store data in the new generic #[code user_data]

@@ -2,16 +2,18 @@
 
 include ../../_includes/_mixins
 
++under-construction
+
 +h(2, "multithreading") Multi-threading with #[code .pipe()]
 
 p
     | If you have a sequence of documents to process, you should use the
-    | #[+api("language#pipe") #[code .pipe()]] method. The method takes an
-    | iterator of texts, and accumulates an internal buffer,
+    | #[+api("language#pipe") #[code Language.pipe()]] method. The method takes
+    | an iterator of texts, and accumulates an internal buffer,
     | which it works on in parallel. It then yields the documents in order,
     | one-by-one. After a long and bitter struggle, the global interpreter
     | lock was freed around spaCy's main parsing loop in v0.100.3. This means
-    | that the #[code .pipe()] method will be significantly faster in most
+    | that #[code .pipe()] will be significantly faster in most
     | practical situations, because it allows shared memory parallelism.
 
 +code.

@@ -20,23 +22,27 @@ p
 
 p
     | To make full use of the #[code .pipe()] function, you might want to
-    | brush up on Python generators. Here are a few quick hints:
+    | brush up on #[strong Python generators]. Here are a few quick hints:
 
 +list
     +item
-        | Generator comprehensions can be written
-        | (#[code item for item in sequence])
+        | Generator comprehensions can be written as
+        | #[code (item for item in sequence)].
 
     +item
-        | The #[code itertools] built-in library and the #[code cytoolz]
-        | package provide a lot of handy generator tools
+        | The
+        | #[+a("https://docs.python.org/2/library/itertools.html") #[code itertools] built-in library]
+        | and the
+        | #[+a("https://github.com/pytoolz/cytoolz") #[code cytoolz] package]
+        | provide a lot of handy #[strong generator tools].
 
     +item
         | Often you'll have an input stream that pairs text with some
-        | important metadata, e.g. a JSON document. To pair up the metadata
-        | with the processed #[code Doc] object, you should use the tee
-        | function to split the generator in two, and then #[code izip] the
-        | extra stream to the document stream.
+        | important metadata, e.g. a JSON document. To
+        | #[strong pair up the metadata] with the processed #[code Doc]
+        | object, you should use the #[code itertools.tee] function to split
+        | the generator in two, and then #[code izip] the extra stream to the
+        | document stream.
 
 +h(2, "own-annotations") Bringing your own annotations

@@ -4,6 +4,8 @@ include ../../_includes/_mixins
 
 +h(2, "features") Features
 
++under-construction
+
 +aside
     | If one of spaCy's functionalities #[strong needs a model], it means that
     | you need to have one of the available

@@ -162,6 +164,8 @@ include _spacy-101/_training
 
 +h(2, "architecture") Architecture
 
++under-construction
+
 +image
     include ../../assets/img/docs/architecture.svg
     .u-text-right

@@ -64,44 +64,10 @@ p
     | predicts the new category with minimal difference from the previous
     | output.
 
-+h(2, "saving-loading") Saving and loading
-
-p
-    | After training our model, you'll usually want to save its state, and load
-    | it back later. You can do this with the #[code Language.save_to_directory()]
-    | method:
-
-+code.
-    nlp.save_to_directory('/home/me/data/en_technology')
-
-p
-    | To make the model more convenient to deploy, we recommend wrapping it as
-    | a Python package, so that you can install it via pip and load it as a
-    | module. spaCy comes with a handy #[+api("cli#package") #[code package]]
-    | CLI command to create all required files and directories.
-
-+code(false, "bash").
-    python -m spacy package /home/me/data/en_technology /home/me/my_models
-
-p
-    | To build the package and create a #[code .tar.gz] archive, run
-    | #[code python setup.py sdist] from within its directory.
-
-+infobox("Saving and loading models")
-    | For more information and a detailed guide on how to package your model,
-    | see the documentation on
-    | #[+a("/docs/usage/saving-loading") saving and loading models].
-
-p
-    | After you've generated and installed the package, you'll be able to
-    | load the model as follows:
-
-+code.
-    import en_technology
-    nlp = en_technology.load()
-
 +h(2, "example") Example: Adding and training an #[code ANIMAL] entity
 
++under-construction
+
 p
     | This script shows how to add a new entity type to an existing pre-trained
     | NER model. To keep the example short and simple, only four sentences are

@@ -170,5 +136,33 @@ p
 
 p
     | After training your model, you can
-    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping
-    | models as Python packages, for ease of deployment.
+    | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend
+    | wrapping models as Python packages, for ease of deployment.
+
++h(2, "saving-loading") Saving and loading
+
+p
+    | After training your model, you'll usually want to save its state, and load
+    | it back later. You can do this with the
+    | #[+api("language#to_disk") #[code Language.to_disk()]] method:
+
++code.
+    nlp.to_disk('/home/me/data/en_technology')
+
+p
+    | To make the model more convenient to deploy, we recommend wrapping it as
+    | a Python package, so that you can install it via pip and load it as a
+    | module. spaCy comes with a handy #[+api("cli#package") #[code package]]
+    | CLI command to create all required files and directories.
+
++code(false, "bash").
+    python -m spacy package /home/me/data/en_technology /home/me/my_models
+
+p
+    | To build the package and create a #[code .tar.gz] archive, run
+    | #[code python setup.py sdist] from within its directory.
+
++infobox("Saving and loading models")
+    | For more information and a detailed guide on how to package your model,
+    | see the documentation on
+    | #[+a("/docs/usage/saving-loading#models") saving and loading models].

@@ -81,59 +81,3 @@ p.o-inline-list
 
 p
     +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example
-
-+h(2, "feature-templates") Customizing the feature extraction
-
-p
-    | spaCy currently uses linear models for the tagger, parser and entity
-    | recognizer, with weights learned using the
-    | #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm].
-
-+aside("Linear Model Feature Scheme")
-    | For a list of the available feature atoms, see the #[+a("/docs/api/features") Linear Model Feature Scheme].
-
-p
-    | Because it's a linear model, it's important for accuracy to build
-    | conjunction features out of the atomic predictors. Let's say you have
-    | two atomic predictors asking, "What is the part-of-speech of the
-    | previous token?", and "What is the part-of-speech of the previous
-    | previous token?". These predictors will introduce a number of features,
-    | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction
-    | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ].
-
-p
-    | The feature extraction proceeds in two passes. In the first pass, we
-    | fill an array with the values of all of the atomic predictors. In the
-    | second pass, we iterate over the feature templates, and fill a small
-    | temporary array with the predictors that will be combined into a
-    | conjunction feature. Finally, we hash this array into a 64-bit integer,
-    | using the MurmurHash algorithm. You can see this at work in the
-    | #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module.
-
-p
-    | It's very easy to change the feature templates, to create novel
-    | combinations of the existing atomic predictors. There's currently no API
-    | available to add new atomic predictors, though. You'll have to create a
-    | subclass of the model, and write your own #[code set_featuresC] method.
-
-p
-    | The feature templates are passed in using the #[code features] keyword
-    | argument to the constructors of the #[+api("tagger") #[code Tagger]],
-    | #[+api("dependencyparser") #[code DependencyParser]] and
-    | #[+api("entityrecognizer") #[code EntityRecognizer]]:
-
-+code.
-    from spacy.vocab import Vocab
-    from spacy.pipeline import Tagger
-    from spacy.tagger import P2_orth, P1_orth
-    from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth
-
-    vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
-    tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster),
-                                     (P2_orth,), (P1_orth,), (W_orth,),
-                                     (N1_orth,), (N2_orth,)])
-
-p
-    | Custom feature templates can be passed to the #[code DependencyParser]
-    | and #[code EntityRecognizer] as well, also using the #[code features]
-    | keyword argument of the constructor.
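
For intuition, the two-pass scheme described in the removed section as a toy Python sketch (illustrative only: thinc hashes a small C array into a 64-bit key with MurmurHash, not Python tuples):

    # Pass 1: fill a table with the values of all atomic predictors.
    atomic = {'P2_pos': 'NN', 'P1_pos': 'VBZ', 'W_orth': 'runs'}

    # Pass 2: for each template, gather its predictors into a small
    # temporary tuple and hash it into a single feature key.
    templates = [
        ('P1_pos',),             # atomic feature, e.g. Prev-pos=VBZ
        ('P2_pos', 'P1_pos'),    # conjunction feature, e.g. Prev-pos=NN&Prev-pos=VBZ
    ]

    features = [hash((template, tuple(atomic[name] for name in template)))
                for template in templates]
    print(features)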

@@ -334,7 +334,7 @@ p
     | token #[code <script src="malicious-code.js"><script>].
     | Instead of relying on the server to render and sanitize HTML, you
     | can do this on the client in JavaScript. displaCy.js creates
-    | the SVG markup as DOM nodes and will never insert raw HTML.
+    | the markup as DOM nodes and will never insert raw HTML.
 
 p
     | The #[code parse_deps] function takes a #[code Doc] object and returns

@@ -25,6 +25,8 @@ include _spacy-101/_word-vectors
 
 +h(2, "custom") Customising word vectors
 
++under-construction
+
 p
     | By default, #[+api("token#vector") #[code Token.vector]] returns the
     | vector for its underlying #[+api("lexeme") #[code Lexeme]], while

@@ -36,3 +38,5 @@ p
     | dictionaries.
 
 +h(2, "similarity") Similarity
+
++under-construction