mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 18:36:36 +03:00
43 lines
1.6 KiB
Plaintext
43 lines
1.6 KiB
Plaintext
//- 💫 DOCS > USAGE > WORD VECTORS & SIMILARITIES
|
|
|
|
include ../../_includes/_mixins
|
|
|
|
p
|
|
| Dense, real-valued vectors representing distributional similarity
|
|
| information are now a cornerstone of practical NLP. The most common way
|
|
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
|
|
| family of algorithms. The default
|
|
| #[+a("/docs/usage/models#available") English model] installs
|
|
| 300-dimensional vectors trained on the Common Crawl
|
|
| corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe]
|
|
| algorithm. The GloVe Common Crawl vectors have become a de facto
|
|
| standard for practical NLP.
|
|
|
|
+aside("Tip: Training a word2vec model")
|
|
| If you need to train a word2vec model, we recommend the implementation in
|
|
| the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
|
|
|
|
+h(2, "101") Similarity and word vectors 101
|
|
+tag-model("vectors")
|
|
|
|
include _spacy-101/_similarity
|
|
include _spacy-101/_word-vectors
|
|
|
|
+h(2, "custom") Customising word vectors
|
|
|
|
+under-construction
|
|
|
|
p
|
|
| By default, #[+api("token#vector") #[code Token.vector]] returns the
|
|
| vector for its underlying #[+api("lexeme") #[code Lexeme]], while
|
|
| #[+api("doc#vector") #[code Doc.vector]] and
|
|
| #[+api("span#vector") #[code Span.vector]] return an average of the
|
|
| vectors of their tokens. You can customise these
|
|
| behaviours by modifying the #[code doc.user_hooks],
|
|
| #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
|
|
| dictionaries.
|
|
|
|
+h(2, "similarity") Similarity
|
|
|
|
+under-construction
|