Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit fdb4b8e456
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-A simple example of extracting relations between phrases and entities using
+"""A simple example of extracting relations between phrases and entities using
 spaCy's named entity recognizer and the dependency parse. Here, we extract
 money and currency values (entities labelled as MONEY) and then check the
 dependency tree to find the noun phrase they are referring to – for example:
@@ -1,8 +1,7 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-This example shows how to navigate the parse tree including subtrees attached
-to a word.
+"""This example shows how to navigate the parse tree including subtrees
+attached to a word.
 
 Based on issue #252:
 "In the documents and tutorials the main thing I haven't found is
@@ -1,9 +1,10 @@
+#!/usr/bin/env python
+# coding: utf8
 """Match a large set of multi-word expressions in O(1) time.
 
 The idea is to associate each word in the vocabulary with a tag, noting whether
 they begin, end, or are inside at least one pattern. An additional tag is used
 for single-word patterns. Complete patterns are also stored in a hash set.
 
 When we process a document, we look up the words in the vocabulary, to
 associate the words with the tags.  We then search for tag-sequences that
 correspond to valid candidates. Finally, we look up the candidates in the hash
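The tagging scheme this docstring describes can be sketched in plain Python roughly as follows. This is an illustrative toy, not the example's actual implementation; the pattern set and helper names are made up.

    # Toy sketch of the scheme in the docstring above (illustrative only):
    # tag each vocabulary word by the positions it can occupy in a pattern,
    # then verify candidate spans against a hash set of complete patterns.
    patterns = {("new", "york"), ("new", "york", "city"), ("london",)}

    begins = {p[0] for p in patterns if len(p) > 1}
    ends = {p[-1] for p in patterns if len(p) > 1}
    inside = {w for p in patterns for w in p[1:-1]}
    singles = {p[0] for p in patterns if len(p) == 1}

    def match(tokens):
        matches = []
        for i, tok in enumerate(tokens):
            if tok in singles and (tok,) in patterns:
                matches.append((i, i + 1))
            if tok in begins:
                for j in range(i + 1, len(tokens)):
                    if tokens[j] in ends and tuple(tokens[i:j + 1]) in patterns:
                        matches.append((i, j + 1))
                    if tokens[j] not in inside and tokens[j] not in ends:
                        break
        return matches

    print(match("i moved to new york city".split()))  # [(3, 5), (3, 6)]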
@@ -1,5 +1,6 @@
-"""
-Example of multi-processing with Joblib. Here, we're exporting
+#!/usr/bin/env python
+# coding: utf8
+"""Example of multi-processing with Joblib. Here, we're exporting
 part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with
 each "sentence" on a newline, and spaces between tokens. Data is loaded from
 the IMDB movie reviews dataset and will be loaded automatically via Thinc's
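For reference, the core Joblib pattern this example is built around looks roughly like this; the worker function and data here are placeholders, not the example's real code.

    from joblib import Parallel, delayed

    def process_batch(texts):
        # placeholder for the real work, e.g. tagging the texts with nlp.pipe()
        return [text.upper() for text in texts]

    batches = [["one text", "two text"], ["red text", "blue text"]]
    results = Parallel(n_jobs=2)(delayed(process_batch)(batch) for batch in batches)
    print(results)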
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-Example of training spaCy's named entity recognizer, starting off with an
+"""Example of training spaCy's named entity recognizer, starting off with an
 existing model or a blank model.
 
 For more details, see the documentation:
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-Example of training an additional entity type
+"""Example of training an additional entity type
 
 This script shows how to add a new entity type to an existing pre-trained NER
 model. To keep the example short and simple, only four sentences are provided
@@ -1,10 +1,7 @@
 #!/usr/bin/env python
 # coding: utf8
-"""
-Example of training spaCy dependency parser, starting off with an existing model
-or a blank model.
-
-For more details, see the documentation:
+"""Example of training spaCy's dependency parser, starting off with an existing
+model or a blank model. For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
 
@@ -3,9 +3,8 @@
 """
 A simple example for training a part-of-speech tagger with a custom tag map.
 To allow us to update the tag map with our custom one, this example starts off
-with a blank Language class and modifies its defaults.
-
-For more details, see the documentation:
+with a blank Language class and modifies its defaults. For more details, see
+the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
 
@@ -3,9 +3,8 @@
 """Train a multi-label convolutional neural network text classifier on the
 IMDB dataset, using the TextCategorizer component. The dataset will be loaded
 automatically via Thinc's built-in dataset loader. The model is added to
-spacy.pipeline, and predictions are available via `doc.cats`.
-
-For more details, see the documentation:
+spacy.pipeline, and predictions are available via `doc.cats`. For more details,
+see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Text classification: https://alpha.spacy.io/usage/text-classification
 
@@ -7,14 +7,13 @@ from __future__ import unicode_literals
 import plac
 import numpy
 
-import from spacy.language import Language
+from spacy.language import Language
 
 
 @plac.annotations(
     vectors_loc=("Path to vectors", "positional", None, str))
 def main(vectors_loc):
-    nlp = Language()
-
+    nlp = Language()  # start off with a blank Language class
     with open(vectors_loc, 'rb') as file_:
         header = file_.readline()
         nr_row, nr_dim = header.split()
@@ -24,9 +23,11 @@ def main(vectors_loc):
             pieces = line.split()
             word = pieces[0]
             vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
-            nlp.vocab.set_vector(word, vector)
-    doc = nlp(u'class colspan')
-    print(doc[0].similarity(doc[1]))
+            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
+    # test the vectors and similarity
+    text = 'class colspan'
+    doc = nlp(text)
+    print(text, doc[0].similarity(doc[1]))
 
 
 if __name__ == '__main__':
@@ -99,7 +99,8 @@ def generate_meta(model_path, existing_meta):
     nlp = util.load_model_from_path(Path(model_path))
     meta['pipeline'] = nlp.pipe_names
     meta['vectors'] = {'width': nlp.vocab.vectors_length,
-                       'entries': len(nlp.vocab.vectors)}
+                       'vectors': len(nlp.vocab.vectors),
+                       'keys': nlp.vocab.vectors.n_keys}
     prints("Enter the package settings for your model. The following "
            "information will be read from your model data: pipeline, vectors.",
            title="Generating meta.json")
@@ -146,7 +146,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                 meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
                                  'gpu': gpu_wps}
                 meta['vectors'] = {'width': nlp.vocab.vectors_length,
-                                   'entries': len(nlp.vocab.vectors)}
+                                   'vectors': len(nlp.vocab.vectors),
+                                   'keys': nlp.vocab.vectors.n_keys}
                 meta['lang'] = nlp.lang
                 meta['pipeline'] = pipeline
                 meta['spacy_version'] = '>=%s' % about.__version__
@@ -155,7 +155,8 @@ class Language(object):
         self._meta.setdefault('url', '')
         self._meta.setdefault('license', '')
         self._meta['vectors'] = {'width': self.vocab.vectors_length,
-                                 'entries': len(self.vocab.vectors)}
+                                 'vectors': len(self.vocab.vectors),
+                                 'keys': self.vocab.vectors.n_keys}
         self._meta['pipeline'] = self.pipe_names
         return self._meta
 
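All three hunks above make the same change: the single 'entries' count in meta.json is split into 'vectors' (unique vector rows) and 'keys' (total keys mapped to rows). An illustrative result, with made-up numbers:

    meta = {}
    meta['vectors'] = {
        'width': 300,      # nlp.vocab.vectors_length: vector dimensions
        'vectors': 20000,  # len(nlp.vocab.vectors): unique vector rows
        'keys': 500000,    # nlp.vocab.vectors.n_keys: keys mapped to those rows
    }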
@@ -184,17 +184,18 @@ cdef class Vectors:
             yield key, self.data[row]
 
     def find(self, *, key=None, keys=None, row=None, rows=None):
-        '''Lookup one or more keys by row, or vice versa.
+        """Look up one or more keys by row, or vice versa.
 
         key (unicode / int): Find the row that the given key points to.
             Returns int, -1 if missing.
-        keys (sequence): Find rows that the keys point to.
+        keys (iterable): Find rows that the keys point to.
             Returns ndarray.
         row (int): Find the first key that points to the row.
             Returns int.
-        rows (sequence): Find the first keys that points to the rows.
+        rows (iterable): Find the keys that point to the rows.
             Returns ndarray.
-        '''
+        RETURNS: The requested key, keys, row or rows.
+        """
         if sum(arg is None for arg in (key, keys, row, rows)) != 3:
             raise ValueError("One (and only one) keyword arg must be set.")
         xp = get_array_module(self.data)
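A usage sketch of the four mutually exclusive keyword arguments documented in the new docstring (illustrative; the small table and the keys 11 and 22 are made up):

    import numpy
    from spacy.vectors import Vectors

    vectors = Vectors(data=numpy.zeros((2, 4), dtype='f'), keys=[11, 22])
    assert vectors.find(key=11) == 0                    # row for one key, -1 if missing
    assert list(vectors.find(keys=[11, 22])) == [0, 1]  # ndarray of rows
    assert vectors.find(row=0) == 11                    # first key pointing to a row
    print(vectors.find(rows=[0, 1]))                    # ndarray of keys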
@@ -5,46 +5,47 @@ include ../_includes/_mixins
 p
     |  Vectors data is kept in the #[code Vectors.data] attribute, which should
     |  be an instance of #[code numpy.ndarray] (for CPU vectors) or
-    |  #[code cupy.ndarray] (for GPU vectors).
+    |  #[code cupy.ndarray] (for GPU vectors). Multiple keys can be mapped to
+    |  the same vector, and not all of the rows in the table need to be
+    |  assigned – so #[code vectors.n_keys] may be greater or smaller than
+    |  #[code vectors.shape[0]].
 
 +h(2, "init") Vectors.__init__
     +tag method
 
 p
-    |  Create a new vector store. To keep the vector table empty, pass
-    |  #[code width=0]. You can also create the vector table and add
-    |  vectors one by one, or set the vector values directly on initialisation.
+    |  Create a new vector store. You can set the vector values and keys
+    |  directly on initialisation, or supply a #[code shape] keyword argument
+    |  to create an empty table you can add vectors to later.
 
 +aside-code("Example").
     from spacy.vectors import Vectors
-    from spacy.strings import StringStore
 
-    empty_vectors = Vectors(StringStore())
+    empty_vectors = Vectors(shape=(10000, 300))
 
-    vectors = Vectors([u'cat'], width=300)
-    vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
-
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), data=vector_table)
+    data = numpy.zeros((3, 300), dtype='f')
+    keys = [u'cat', u'dog', u'rat']
+    vectors = Vectors(data=data, keys=keys)
 
 +table(["Name", "Type", "Description"])
-    +row
-        +cell #[code strings]
-        +cell #[code StringStore] or list
-        +cell
-            |  List of strings, or a #[+api("stringstore") #[code StringStore]]
-            |  that maps strings to hash values, and vice versa.
-
-    +row
-        +cell #[code width]
-        +cell int
-        +cell Number of dimensions.
-
     +row
         +cell #[code data]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell The vector data.
 
+    +row
+        +cell #[code keys]
+        +cell iterable
+        +cell A sequence of keys aligned with the data.
+
+    +row
+        +cell #[code shape]
+        +cell tuple
+        +cell
+            |  Size of the table as #[code (n_entries, n_columns)], the number
+            |  of entries and number of columns. Not required if you're
+            |  initialising the object with #[code data] and #[code keys].
+
     +row("foot")
         +cell returns
         +cell #[code Vectors]
@@ -54,97 +55,92 @@ p
     +tag method
 
 p
-    |  Get a vector by key. If key is a string, it is hashed to an integer ID
-    |  using the #[code Vectors.strings] table. If the integer key is not found
-    |  in the table, a #[code KeyError] is raised.
+    |  Get a vector by key. If the key is not found in the table, a
+    |  #[code KeyError] is raised.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
-    cat_vector = vectors[u'cat']
+    cat_id = nlp.vocab.strings[u'cat']
+    cat_vector = nlp.vocab.vectors[cat_id]
+    assert cat_vector == nlp.vocab[u'cat'].vector
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code key]
-        +cell unicode / int
+        +cell int
         +cell The key to get the vector for.
 
     +row
         +cell returns
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell The vector for the key.
 
 +h(2, "setitem") Vectors.__setitem__
     +tag method
 
 p
-    |  Set a vector for the given key. If key is a string, it is hashed to an
-    |  integer ID using the #[code Vectors.strings] table.
+    |  Set a vector for the given key.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
+    cat_id = nlp.vocab.strings[u'cat']
+    vector = numpy.random.uniform(-1, 1, (300,))
+    nlp.vocab.vectors[cat_id] = vector
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code key]
-        +cell unicode / int
+        +cell int
         +cell The key to set the vector for.
 
     +row
         +cell #[code vector]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell The vector to set.
 
 +h(2, "iter") Vectors.__iter__
     +tag method
 
-p Yield vectors from the table.
+p Iterate over the keys in the table.
 
 +aside-code("Example").
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
-    for vector in vectors:
-        print(vector)
+    for key in nlp.vocab.vectors:
+        print(key, nlp.vocab.strings[key])
 
 +table(["Name", "Type", "Description"])
     +row("foot")
         +cell yields
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
-        +cell A vector from the table.
+        +cell int
+        +cell A key in the table.
 
 +h(2, "len") Vectors.__len__
     +tag method
 
-p Return the number of vectors that have been assigned.
+p Return the number of vectors in the table.
 
 +aside-code("Example").
-    vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(shape=(3, 300))
     assert len(vectors) == 3
 
 +table(["Name", "Type", "Description"])
     +row("foot")
         +cell returns
         +cell int
-        +cell The number of vectors in the data.
+        +cell The number of vectors in the table.
 
 +h(2, "contains") Vectors.__contains__
     +tag method
 
 p
-    |  Check whether a key has a vector entry in the table. If key is a string,
-    |  it is hashed to an integer ID using the #[code Vectors.strings] table.
+    |  Check whether a key has been mapped to a vector entry in the table.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
-    assert u'cat' in vectors
+    cat_id = nlp.vocab.strings[u'cat']
+    nlp.vocab.vectors.add(cat_id, vector=numpy.random.uniform(-1, 1, (300,)))
+    assert cat_id in nlp.vocab.vectors
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code key]
-        +cell unicode / int
+        +cell int
         +cell The key to check.
 
     +row("foot")
@@ -156,13 +152,20 @@ p
     +tag method
 
 p
-    |  Add a key to the table, optionally setting a vector value as well. If
-    |  key is a string, it is hashed to an integer ID using the
-    |  #[code Vectors.strings] table.
+    |  Add a key to the table, optionally setting a vector value as well. Keys
+    |  can be mapped to an existing vector by setting #[code row], or a new
+    |  vector can be added. When adding unicode keys, keep in mind that the
+    |  #[code Vectors] class itself has no
+    |  #[+api("stringstore") #[code StringStore]], so you have to store the
+    |  hash-to-string mapping separately. If you need to manage the strings,
+    |  you should use the #[code Vectors] via the
+    |  #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors].
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
+    vector = numpy.random.uniform(-1, 1, (300,))
+    cat_id = nlp.vocab.strings[u'cat']
+    nlp.vocab.vectors.add(cat_id, vector=vector)
+    nlp.vocab.vectors.add(u'dog', row=0)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -172,25 +175,66 @@ p
 
     +row
         +cell #[code vector]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
-        +cell An optional vector to add.
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+        +cell An optional vector to add for the key.
 
+    +row
+        +cell #[code row]
+        +cell int
+        +cell An optional row number of a vector to map the key to.
+
     +row("foot")
         +cell returns
         +cell int
         +cell The row the vector was added to.
 
++h(2, "keys") Vectors.keys
+    +tag method
+
+p A sequence of the keys in the table.
+
++aside-code("Example").
+    for key in nlp.vocab.vectors.keys():
+        print(key, nlp.vocab.strings[key])
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell iterable
+        +cell The keys.
+
++h(2, "values") Vectors.values
+    +tag method
+
+p
+    |  Iterate over vectors that have been assigned to at least one key. Note
+    |  that some vectors may be unassigned, so the number of vectors returned
+    |  may be less than the length of the vectors table.
+
++aside-code("Example").
+    for vector in nlp.vocab.vectors.values():
+        print(vector)
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell yields
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+        +cell A vector in the table.
+
 +h(2, "items") Vectors.items
     +tag method
 
-p Iterate over #[code (string key, vector)] pairs, in order.
+p Iterate over #[code (key, vector)] pairs, in order.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
-    vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
-    for key, vector in vectors.items():
-        print(key, vector)
+    for key, vector in nlp.vocab.vectors.items():
+        print(key, nlp.vocab.strings[key], vector)
 
 +table(["Name", "Type", "Description"])
     +row("foot")
         +cell yields
         +cell tuple
-        +cell #[code (string key, vector)] pairs, in order.
+        +cell #[code (key, vector)] pairs, in order.
 
 +h(2, "shape") Vectors.shape
     +tag property
@@ -200,7 +244,7 @@ p
     |  dimensions in the vector table.
 
 +aside-code("Example").
-    vectors = Vectors(StringStore(), 300)
+    vectors = Vectors(shape=(1, 300))
     vectors.add(u'cat', vector=numpy.random.uniform(-1, 1, (300,)))
     rows, dims = vectors.shape
     assert rows == 1
@@ -212,6 +256,59 @@ p
         +cell tuple
         +cell A #[code (rows, dims)] pair.
 
++h(2, "size") Vectors.size
+    +tag property
+
+p The vector size, i.e. #[code rows * dims].
+
++aside-code("Example").
+    vectors = Vectors(shape=(500, 300))
+    assert vectors.size == 150000
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell int
+        +cell The vector size.
+
++h(2, "is_full") Vectors.is_full
+    +tag property
+
+p
+    |  Whether the vectors table is full and no slots are available for new
+    |  keys. If a table is full, it can be resized using
+    |  #[+api("vectors#resize") #[code Vectors.resize]].
+
++aside-code("Example").
+    vectors = Vectors(shape=(1, 300))
+    vectors.add(u'cat', vector=numpy.random.uniform(-1, 1, (300,)))
+    assert vectors.is_full
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell bool
+        +cell Whether the vectors table is full.
+
++h(2, "n_keys") Vectors.n_keys
+    +tag property
+
+p
+    |  Get the number of keys in the table. Note that this is the number of
+    |  #[em all] keys, not just unique vectors. If several keys are mapped
+    |  to the same vector, they will be counted individually.
+
++aside-code("Example").
+    vectors = Vectors(shape=(10, 300))
+    assert len(vectors) == 10
+    assert vectors.n_keys == 0
+
++table(["Name", "Type", "Description"])
+    +row("foot")
+        +cell returns
+        +cell int
+        +cell The number of all keys in the table.
+
 +h(2, "from_glove") Vectors.from_glove
     +tag method
 
@@ -223,6 +320,10 @@ p
     |  float32 vectors, #[code vectors.300.d.bin] for 300d float64 (double)
     |  vectors, etc. By default GloVe outputs 64-bit vectors.
 
++aside-code("Example").
+    vectors = Vectors()
+    vectors.from_glove('/path/to/glove_vectors')
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code path]
@@ -323,7 +424,7 @@ p Load state from a binary string.
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code data]
-        +cell #[code numpy.ndarray] / #[code cupy.ndarray]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell
             |  Stored vectors data. #[code numpy] is used for CPU vectors,
             |  #[code cupy] for GPU vectors.
@@ -337,7 +438,7 @@ p Load state from a binary string.
 
     +row
         +cell #[code keys]
-        +cell #[code numpy.ndarray]
+        +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
         +cell
             |  Array keeping the keys in order, such that
             |  #[code keys[vectors.key2row[key]] == key]
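Taken together, the reworked docs above describe the following semantics; a hedged end-to-end sketch based only on the behaviour documented in this diff (spaCy v2-style API, integer keys chosen arbitrarily):

    import numpy
    from spacy.vectors import Vectors

    vectors = Vectors(shape=(2, 300))
    assert len(vectors) == 2 and vectors.n_keys == 0  # rows allocated, no keys yet
    vectors.add(123, vector=numpy.random.uniform(-1, 1, (300,)))
    vectors.add(456, row=0)       # second key mapped to the same row
    assert vectors.n_keys == 2    # all keys are counted individually
    assert len(vectors) == 2      # the number of rows is unchanged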
@@ -47,7 +47,7 @@
     font: 600 1.1rem/#{1} $font-secondary
     background: $color-theme
     color: $color-back
-    padding: 0.15em 0.5em 0.35em
+    padding: 2px 6px 4px
     border-radius: 1em
     text-transform: uppercase
     vertical-align: middle
@@ -1,6 +1,6 @@
 'use strict';
 
-import { Templater, handleResponse, convertNumber } from './util.js';
+import { Templater, handleResponse, convertNumber, abbrNumber } from './util.js';
 
 /**
  * Chart.js defaults
@@ -25,7 +25,7 @@ export const formats = {
     license: (license, url) => url ? `<a href="${url}" target="_blank">${license}</a>` : license,
     sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
     pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
-    vectors: vec => vec ? `${convertNumber(vec.entries)} (${vec.width} dimensions)` : 'n/a',
+    vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
     version: version => `<code>v${version}</code>`
 };
 
@@ -240,7 +240,8 @@ export class ModelComparer {
         return data;
     }
 
-    showError() {
+    showError(err) {
+        console.error(err);
         this.tpl.get('result').style.display = 'none';
         this.tpl.get('error').style.display = 'block';
     }
@@ -46,11 +46,24 @@ export const handleResponse = res => {
     else return ({ ok: res.ok })
 };
 
-
 /**
  * Convert a number to a string and add thousand separator.
  * @param {number|string} num - The number to convert.
  * @param {string} separator – Thousand separator.
 */
-export const convertNumber = (num, separator = ',') =>
+export const convertNumber = (num = 0, separator = ',') =>
     num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
+
+/**
+ * Abbreviate a number, e.g. 14249930 --> 14.25m.
+ * @param {number|string} num - The number to convert.
+ * @param {number} fixed - Number of decimals.
+ */
+export const abbrNumber = (num = 0, fixed = 2) => {
+    const suffixes = ['', 'k', 'm', 'b', 't'];
+    if (num === null || num === 0) return 0;
+    const b = num.toPrecision(2).split('e');
+    const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
+    const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+    return (c < 0 ? c : Math.abs(c)) + suffixes[k];
+}
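For clarity, the abbreviation logic added above, re-sketched in Python (a simplified equivalent for illustration, not part of the site's code):

    def abbr_number(num, fixed=2):
        # abbreviate a number, e.g. 14249930 -> '14.25m'
        suffixes = ['', 'k', 'm', 'b', 't']
        if not num:
            return '0'
        k = min((len(str(abs(int(num)))) - 1) // 3, len(suffixes) - 1)
        value = num / (1000 ** k)
        return ('%.*f' % (fixed, value)).rstrip('0').rstrip('.') + suffixes[k]

    print(abbr_number(14249930))  # 14.25m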
@@ -100,6 +100,7 @@
         "hu": "Hungarian",
         "pl": "Polish",
         "he": "Hebrew",
+        "ga": "Irish",
         "bn": "Bengali",
         "hi": "Hindi",
         "id": "Indonesian",
@@ -114,6 +115,8 @@
         "de": "Dies ist ein Satz.",
         "fr": "C'est une phrase.",
         "es": "Esto es una frase.",
+        "pt": "Esta é uma frase.",
+        "it": "Questa è una frase.",
         "xx": "This is a sentence about Facebook."
     }
 }
@@ -116,7 +116,6 @@
         "next": "text-classification",
         "menu": {
             "Basics": "basics",
-            "Similarity in Context": "in-context",
             "Custom Vectors": "custom",
             "GPU Usage": "gpu"
         }
@@ -19,6 +19,7 @@
 
     +qs({package: 'source'}) git clone https://github.com/explosion/spaCy
     +qs({package: 'source'}) cd spaCy
+    +qs({package: 'source'}) export PYTHONPATH=`pwd`
     +qs({package: 'source'}) pip install -r requirements.txt
     +qs({package: 'source'}) pip install -e .
 
@@ -46,7 +46,6 @@ p
         +item #[strong Chinese]: #[+a("https://github.com/fxsjy/jieba") Jieba]
         +item #[strong Japanese]: #[+a("https://github.com/mocobeta/janome") Janome]
         +item #[strong Thai]: #[+a("https://github.com/wannaphongcom/pythainlp") pythainlp]
-        +item #[strong Russian]: #[+a("https://github.com/kmike/pymorphy2") pymorphy2]
 
 +h(3, "multi-language") Multi-language support
     +tag-new(2)
@@ -13,3 +13,127 @@
 
 include ../_spacy-101/_similarity
 include ../_spacy-101/_word-vectors
+
++h(3, "in-context") Similarities in context
+
+p
+    |  Aside from spaCy's built-in word vectors, which were trained on a lot of
+    |  text with a wide vocabulary, the parsing, tagging and NER models also
+    |  rely on vector representations of the #[strong meanings of words in context].
+    |  As the first component of the
+    |  #[+a("/usage/processing-pipelines") processing pipeline], the
+    |  tensorizer encodes a document's internal meaning representations as an
+    |  array of floats, also called a tensor. This allows spaCy to make a
+    |  reasonable guess at a word's meaning, based on its surrounding words.
+    |  Even if a word hasn't been seen before, spaCy will know #[em something]
+    |  about it. Because spaCy uses a 4-layer convolutional network, the
+    |  tensors are sensitive to up to #[strong four words on either side] of a
+    |  word.
+
+p
+    |  For example, here are three sentences containing the out-of-vocabulary
+    |  word "labrador" in different contexts.
+
++code.
+    doc1 = nlp(u"The labrador barked.")
+    doc2 = nlp(u"The labrador swam.")
+    doc3 = nlp(u"the labrador people live in canada.")
+
+    for doc in [doc1, doc2, doc3]:
+        labrador = doc[1]
+        dog = nlp(u"dog")
+        print(labrador.similarity(dog))
+
+p
+    |  Even though the model has never seen the word "labrador", it can make a
+    |  fairly accurate prediction of its similarity to "dog" in different
+    |  contexts.
+
++table(["Context", "labrador.similarity(dog)"])
+    +row
+        +cell The #[strong labrador] barked.
+        +cell #[code 0.56] #[+procon("yes", "similar")]
+
+    +row
+        +cell The #[strong labrador] swam.
+        +cell #[code 0.48] #[+procon("no", "dissimilar")]
+
+    +row
+        +cell the #[strong labrador] people live in canada.
+        +cell #[code 0.39] #[+procon("no", "dissimilar")]
+
+p
+    |  The same also works for whole documents. Here, the variance of the
+    |  similarities is lower, as all words and their order are taken into
+    |  account. However, the context-specific similarity is often still
+    |  reflected pretty accurately.
+
++code.
+    doc1 = nlp(u"Paris is the largest city in France.")
+    doc2 = nlp(u"Vilnius is the capital of Lithuania.")
+    doc3 = nlp(u"An emu is a large bird.")
+
+    for doc in [doc1, doc2, doc3]:
+        for other_doc in [doc1, doc2, doc3]:
+            print(doc.similarity(other_doc))
+
+p
+    |  Even though the sentences about Paris and Vilnius consist of different
+    |  words and entities, they both describe the same concept and are seen as
+    |  more similar than the sentence about emus. In this case, even a misspelled
+    |  version of "Vilnius" would still produce very similar results.
+
++table
+    - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
+    - var counter = 0
+
+    +row("head")
+        +cell
+        for _, label in examples
+            +cell=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
+
+p
+    |  Sentences that consist of the same words in different order will likely
+    |  be seen as very similar – but never identical.
+
++code.
+    docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
+            nlp(u"man dog bites"), nlp(u"dog man bites")]
+
+    for doc in docs:
+        for other_doc in docs:
+            print(doc.similarity(other_doc))
+
+p
+    |  Interestingly, "man bites dog" and "man dog bites" are seen as slightly
+    |  more similar than "man bites dog" and "dog bites man". This may be a
+    |  coincidence – or the result of "man" being interpreted as both sentences'
+    |  subject.
+
++table
+    - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
+    - var counter = 0
+
+    +row("head")
+        +cell
+        for _, label in examples
+            +cell.u-text-center=label
+
+    each cells, label in examples
+        +row(counter ? null : "divider")
+            +cell=label
+            for cell in cells
+                +cell.u-text-center
+                    - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+                    |  #[code=cell.toFixed(2)] #[+procon(...result)]
+        - counter++
@@ -1,49 +1,137 @@
 //- 💫 DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS
 
 p
-    |  By default, #[+api("token#vector") #[code Token.vector]] returns the
-    |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
-    |  #[+api("doc#vector") #[code Doc.vector]] and
-    |  #[+api("span#vector") #[code Span.vector]] return an average of the
-    |  vectors of their tokens. You can customize these
-    |  behaviours by modifying the #[code doc.user_hooks],
-    |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
-    |  dictionaries.
+    |  Word vectors let you import knowledge from raw text into your model. The
+    |  knowledge is represented as a table of numbers, with one row per term in
+    |  your vocabulary. If two terms are used in similar contexts, the algorithm
+    |  that learns the vectors should assign them
+    |  #[strong rows that are quite similar], while words that are used in
+    |  different contexts will have quite different values. This lets you use
+    |  the row-values assigned to the words as a kind of dictionary, to tell you
+    |  some things about what the words in your text mean.
 
-+infobox
-    |  For more details on #[strong adding hooks] and #[strong overwriting] the
-    |  built-in #[code Doc], #[code Span] and #[code Token] methods, see the
-    |  usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
+p
+    |  Word vectors are particularly useful for terms which
+    |  #[strong aren't well represented in your labelled training data].
+    |  For instance, if you're doing named entity recognition, there will always
+    |  be lots of names that you don't have examples of. For example, imagine
+    |  your training data happens to contain some examples of the term
+    |  "Microsoft", but it doesn't contain any examples of the term "Symantec".
+    |  In your raw text sample, there are plenty of examples of both terms, and
+    |  they're used in similar contexts. The word vectors make that fact
+    |  available to the entity recognition model. It still won't see examples of
+    |  "Symantec" labelled as a company. However, it'll see that "Symantec" has
+    |  a word vector that usually corresponds to company terms, so it can
+    |  #[strong make the inference].
+
+p
+    |  In order to make best use of the word vectors, you want the word vectors
+    |  table to cover a #[strong very large vocabulary]. However, most words are
+    |  rare, so most of the rows in a large word vectors table will be accessed
+    |  very rarely, or never at all. You can usually cover more than
+    |  #[strong 95% of the tokens] in your corpus with just
+    |  #[strong a few thousand rows] in the vector table. However, it's those
+    |  #[strong 5% of rare terms] where the word vectors are
+    |  #[strong most useful]. The problem is that increasing the size of the
+    |  vector table produces rapidly diminishing returns in coverage over these
+    |  rare terms.
+
| +h(3, "custom-vectors-coverage") Optimising vector coverage | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  To help you strike a good balance between coverage and memory usage, | ||||
|     |  spaCy's #[+api("vectors") #[code Vectors]] class lets you map | ||||
|     |  #[strong multiple keys] to the #[strong same row] of the table. If | ||||
|     |  you're using the #[+api("cli#vocab") #[code spacy vocab]] command to | ||||
|     |  create a vocabulary, pruning the vectors will be taken care of | ||||
|     |  automatically. You can also do it manually in the following steps: | ||||
| 
 | ||||
| +list("numbers") | ||||
|     +item | ||||
|         |  Start with a #[strong word vectors model] that covers a huge | ||||
|         |  vocabulary. For instance, the | ||||
|         |  #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model | ||||
|         |  provides 300-dimensional GloVe vectors for over 1 million terms of | ||||
|         |  English. | ||||
| 
 | ||||
|     +item | ||||
|         |  If your vocabulary has values set for the #[code Lexeme.prob] | ||||
|         |  attribute, the lexemes will be sorted by descending probability to | ||||
|         |  determine which vectors to prune. Otherwise, lexemes will be sorted | ||||
|         |  by their order in the #[code Vocab]. | ||||
| 
 | ||||
|     +item | ||||
|         |  Call #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] with | ||||
|         |  the number of vectors you want to keep. | ||||
| 
 | ||||
| +code. | ||||
|     nlp = spacy.load('en_vectors_web_lg') | ||||
|     n_vectors = 105000  # number of vectors to keep | ||||
|     removed_words = nlp.vocab.prune_vectors(n_vectors) | ||||
| 
 | ||||
|     assert len(nlp.vocab.vectors) <= n_vectors  # unique vectors have been pruned | ||||
|     assert nlp.vocab.vectors.n_keys > n_vectors  # but not the total entries | ||||
| 
 | ||||
| p | ||||
|     |  #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] reduces the | ||||
|     |  current vector table to a given number of unique entries, and returns a | ||||
|     |  dictionary containing the removed words, mapped to #[code (string, score)] | ||||
|     |  tuples, where #[code string] is the entry the removed word was mapped | ||||
|     |  to, and #[code score] the similarity score between the two words. | ||||
| 
 | ||||
| +code("Removed words"). | ||||
|     { | ||||
|         'Shore': ('coast', 0.732257), | ||||
|         'Precautionary': ('caution', 0.490973), | ||||
|         'hopelessness': ('sadness', 0.742366), | ||||
|         'Continous': ('continuous', 0.732549), | ||||
|         'Disemboweled': ('corpse', 0.499432), | ||||
|         'biostatistician': ('scientist', 0.339724), | ||||
|         'somewheres': ('somewheres', 0.402736), | ||||
|         'observing': ('observe', 0.823096), | ||||
|         'Leaving': ('leaving', 1.0) | ||||
|     } | ||||
| 
 | ||||
| p | ||||
|     |  In the example above, the vector for "Shore" was removed and remapped | ||||
|     |  to the vector of "coast", which is deemed about 73% similar. "Leaving" | ||||
|     |  was remapped to the vector of "leaving", which is identical. | ||||
| 
 | ||||
| +h(3, "custom-vectors-add") Adding vectors | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  The new #[+api("vectors") #[code Vectors]] class makes it easy to add | ||||
|     |  your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]], | ||||
|     |  it is initialised with a #[+api("stringstore") #[code StringStore]] or | ||||
|     |  a list of strings. | ||||
|     |  spaCy's new #[+api("vectors") #[code Vectors]] class greatly improves the | ||||
|     |  way word vectors are stored, accessed and used. The data is stored in | ||||
|     |  two structures: | ||||
| 
 | ||||
| +code("Adding vectors one-by-one"). | ||||
|     from spacy.strings import StringStore | ||||
|     from spacy.vectors import Vectors | ||||
| +list | ||||
|     +item | ||||
|         |  An array, which can be either on CPU or #[+a("#gpu") GPU]. | ||||
| 
 | ||||
|     vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)), | ||||
|                    'cat': numpy.random.uniform(-1, 1, (300,)), | ||||
|                    'orange': numpy.random.uniform(-1, 1, (300,))} | ||||
| 
 | ||||
|     vectors = Vectors(StringStore(), 300) | ||||
|     for word, vector in vector_data.items(): | ||||
|         vectors.add(word, vector) | ||||
|     +item | ||||
|         |  A dictionary mapping string-hashes to rows in the table. | ||||
| 
 | ||||
| p | ||||
|     |  You can also add the vector values directly on initialisation: | ||||
|     |  Keep in mind that the #[code Vectors] class itself has no | ||||
|     |  #[+api("stringstore") #[code StringStore]], so you have to store the | ||||
|     |  hash-to-string mapping separately. If you need to manage the strings, | ||||
|     |  you should use the #[code Vectors] via the | ||||
|     |  #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors]. To | ||||
|     |  add vectors to the vocabulary, you can use the | ||||
|     |  #[+api("vocab#set_vector") #[code Vocab.set_vector]] method. | ||||
| 
 | ||||
| +code("Adding vectors on initialisation"). | ||||
|     from spacy.vectors import Vectors | ||||
| +code("Adding vectors"). | ||||
|     from spacy.vocab import Vocab | ||||
| 
 | ||||
|     vector_table = numpy.zeros((3, 300), dtype='f') | ||||
|     vectors = Vectors([u'dog', u'cat', u'orange'], vector_table) | ||||
|     vector_data = {u'dog': numpy.random.uniform(-1, 1, (300,)), | ||||
|                    u'cat': numpy.random.uniform(-1, 1, (300,)), | ||||
|                    u'orange': numpy.random.uniform(-1, 1, (300,))} | ||||
| 
 | ||||
|     vocab = Vocab() | ||||
|     for word, vector in vector_data.items(): | ||||
|         vocab.set_vector(word, vector) | ||||
| 
 | ||||
| +h(3, "custom-loading-glove") Loading GloVe vectors | ||||
|     +tag-new(2) | ||||
@@ -89,3 +177,20 @@ p
     |  #[+api("vocab#set_vector") #[code set_vector]] method.
 
 +github("spacy", "examples/vectors_fast_text.py")
+
++h(3, "custom-similarity") Using custom similarity methods
+
+p
+    |  By default, #[+api("token#vector") #[code Token.vector]] returns the
+    |  vector for its underlying #[+api("lexeme") #[code Lexeme]], while
+    |  #[+api("doc#vector") #[code Doc.vector]] and
+    |  #[+api("span#vector") #[code Span.vector]] return an average of the
+    |  vectors of their tokens. You can customise these
+    |  behaviours by modifying the #[code doc.user_hooks],
+    |  #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
+    |  dictionaries.
+
++infobox
+    |  For more details on #[strong adding hooks] and #[strong overwriting] the
+    |  built-in #[code Doc], #[code Span] and #[code Token] methods, see the
+    |  usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
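The "Microsoft"/"Symantec" reasoning in the hunk above can be illustrated with a toy vector table (all numbers made up; real vectors have hundreds of dimensions):

    import numpy

    table = {
        'Microsoft': numpy.array([0.81, -0.24, 0.05]),
        'Symantec':  numpy.array([0.79, -0.21, 0.07]),   # similar context, similar row
        'banana':    numpy.array([-0.55, 0.90, -0.32]),  # different context
    }

    def cosine(a, b):
        return a.dot(b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b))

    print(cosine(table['Microsoft'], table['Symantec']))  # high similarity
    print(cosine(table['Microsoft'], table['banana']))    # low similarity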
@@ -1,123 +0,0 @@
-//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT

(The rest of this deleted file is identical to the "Similarities in context" content added above.)
@@ -5,10 +5,6 @@ include ../_includes/_mixins
 +section("basics")
     include _vectors-similarity/_basics
 
-+section("in-context")
-    +h(2, "in-context") Similarities in context
-    include _vectors-similarity/_in-context
-
 +section("custom")
     +h(2, "custom") Customising word vectors
     include _vectors-similarity/_custom