mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			43 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			43 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //- 💫 DOCS > USAGE > WORD VECTORS & SIMILARITIES
 | |
| 
 | |
| include ../../_includes/_mixins
 | |
| 
 | |
| p
 | |
|     |  Dense, real-valued vectors representing distributional similarity
 | |
|     |  information are now a cornerstone of practical NLP. The most common way
 | |
|     |  to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
 | |
|     |  family of algorithms. The default
 | |
|     |  #[+a("/docs/usage/models#available") English model] installs
 | |
|     |  300-dimensional vectors trained on the Common Crawl
 | |
|     |  corpus using the #[+a("http://nlp.stanford.edu/projects/glove/") GloVe]
 | |
|     |  algorithm. The GloVe Common Crawl vectors have become a de facto
 | |
|     |  standard for practical NLP.
 | |
| 
 | |
| +aside("Tip: Training a word2vec model")
 | |
|     |  If you need to train a word2vec model, we recommend the implementation in
 | |
|     |  the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
 | |
| 
 | |
| +h(2, "101") Similarity and word vectors 101
 | |
|     +tag-model("vectors")
 | |
| 
 | |
| include _spacy-101/_similarity
 | |
| include _spacy-101/_word-vectors
 | |
| 
 | |
| +h(2, "custom") Customising word vectors
 | |
| 
 | |
| +under-construction
 | |
| 
 | |
| p
 | |
|     |  By default, #[+api("token#vector") #[code Token.vector]] returns the
 | |
|     |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
 | |
|     |  #[+api("doc#vector") #[code Doc.vector]] and
 | |
|     |  #[+api("span#vector") #[code Span.vector]] return an average of the
 | |
|     |  vectors of their tokens. You can customise these
 | |
|     |  behaviours by modifying the #[code doc.user_hooks],
 | |
|     |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
 | |
|     |  dictionaries.
 | |
| 
 | |
| +h(2, "similarity") Similarity
 | |
| 
 | |
| +under-construction
 |