diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py
index b73dcbf3b..47b20057c 100644
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@@ -1,7 +1,6 @@
#!/usr/bin/env python
# coding: utf8
-"""
-A simple example of extracting relations between phrases and entities using
+"""A simple example of extracting relations between phrases and entities using
spaCy's named entity recognizer and the dependency parse. Here, we extract
money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to – for example:
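The example the docstring points to is cut off in this hunk. As a rough, simplified sketch of the idea it describes (not the script's own code), assuming the en_core_web_sm model is installed:

    import spacy

    nlp = spacy.load('en_core_web_sm')  # assumption: a small English model is available
    doc = nlp(u"Net income was $9.4 million compared to the prior year of $2.7 million.")
    for ent in doc.ents:
        if ent.label_ == 'MONEY':
            # climb from the entity's root token to the phrase the amount attaches to
            print(ent.text, '-->', ent.root.head.text)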
diff --git a/examples/information_extraction/parse_subtrees.py b/examples/information_extraction/parse_subtrees.py
index 5963d014c..2a258b31d 100644
--- a/examples/information_extraction/parse_subtrees.py
+++ b/examples/information_extraction/parse_subtrees.py
@@ -1,8 +1,7 @@
#!/usr/bin/env python
# coding: utf8
-"""
-This example shows how to navigate the parse tree including subtrees attached
-to a word.
+"""This example shows how to navigate the parse tree including subtrees
+attached to a word.
Based on issue #252:
"In the documents and tutorials the main thing I haven't found is
diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py
index 2dd2691b9..0b5bcdc7f 100644
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
@@ -1,9 +1,10 @@
+#!/usr/bin/env python
+# coding: utf8
"""Match a large set of multi-word expressions in O(1) time.
The idea is to associate each word in the vocabulary with a tag, noting whether
they begin, end, or are inside at least one pattern. An additional tag is used
for single-word patterns. Complete patterns are also stored in a hash set.
-
When we process a document, we look up the words in the vocabulary, to
associate the words with the tags. We then search for tag-sequences that
correspond to valid candidates. Finally, we look up the candidates in the hash
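The docstring above describes the algorithm behind spaCy's PhraseMatcher; as a hedged usage sketch of that class (not this example file's own code), using the v2 matcher API:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank('en')
    matcher = PhraseMatcher(nlp.vocab)
    # patterns are Doc objects, so matching runs over token sequences, not raw strings
    patterns = [nlp(text) for text in (u'New York', u'San Francisco')]
    matcher.add('CITIES', None, *patterns)

    doc = nlp(u"I flew from New York to San Francisco.")
    for match_id, start, end in matcher(doc):
        print(doc[start:end].text)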
diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py
index 19b1c462a..99bb9c53f 100644
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@@ -1,5 +1,6 @@
-"""
-Example of multi-processing with Joblib. Here, we're exporting
+#!/usr/bin/env python
+# coding: utf8
+"""Example of multi-processing with Joblib. Here, we're exporting
part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with
each "sentence" on a newline, and spaces between tokens. Data is loaded from
the IMDB movie reviews dataset and will be loaded automatically via Thinc's
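A condensed sketch of the batching-plus-Joblib pattern the docstring describes, with a made-up transform function rather than the script's own, assuming joblib and a small English model are installed:

    import spacy
    from joblib import Parallel, delayed

    def transform_batch(batch_id, texts):
        # each worker loads its own pipeline and handles one batch of texts
        nlp = spacy.load('en_core_web_sm')
        return [' '.join(tok.text for tok in doc) for doc in nlp.pipe(texts)]

    texts = [u"This is a sentence.", u"This is another one."] * 100
    batches = [texts[i:i + 50] for i in range(0, len(texts), 50)]
    results = Parallel(n_jobs=2)(delayed(transform_batch)(i, batch)
                                 for i, batch in enumerate(batches))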
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 499807d23..e95cce4c9 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -1,7 +1,6 @@
#!/usr/bin/env python
# coding: utf8
-"""
-Example of training spaCy's named entity recognizer, starting off with an
+"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.
For more details, see the documentation:
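In rough outline, a condensed and hedged sketch of the training loop such a script implements, using the v2 update API with toy data rather than the script's own training set:

    import random
    import spacy

    TRAIN_DATA = [
        (u"Who is Shaka Khan?", {'entities': [(7, 17, 'PERSON')]}),
        (u"I like London.", {'entities': [(7, 13, 'LOC')]}),
    ]

    nlp = spacy.blank('en')                 # or spacy.load(...) for an existing model
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    for _, annotations in TRAIN_DATA:
        for start, end, label in annotations['entities']:
            ner.add_label(label)

    optimizer = nlp.begin_training()
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)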
diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py
index ec1e562c6..1c70f7c03 100644
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@@ -1,7 +1,6 @@
#!/usr/bin/env python
# coding: utf8
-"""
-Example of training an additional entity type
+"""Example of training an additional entity type
This script shows how to add a new entity type to an existing pre-trained NER
model. To keep the example short and simple, only four sentences are provided
diff --git a/examples/training/train_parser.py b/examples/training/train_parser.py
index a23d73ec7..e321fdb1e 100644
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -1,10 +1,7 @@
#!/usr/bin/env python
# coding: utf8
-"""
-Example of training spaCy dependency parser, starting off with an existing model
-or a blank model.
-
-For more details, see the documentation:
+"""Example of training spaCy dependency parser, starting off with an existing
+model or a blank model. For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index c6fc1de88..7508c2e66 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -3,9 +3,8 @@
"""
A simple example for training a part-of-speech tagger with a custom tag map.
To allow us to update the tag map with our custom one, this example starts off
-with a blank Language class and modifies its defaults.
-
-For more details, see the documentation:
+with a blank Language class and modifies its defaults. For more details, see
+the documentation:
* Training: https://alpha.spacy.io/usage/training
* POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py
index 1f9cd29aa..fc9610a66 100644
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@@ -3,9 +3,8 @@
"""Train a multi-label convolutional neural network text classifier on the
IMDB dataset, using the TextCategorizer component. The dataset will be loaded
automatically via Thinc's built-in dataset loader. The model is added to
-spacy.pipeline, and predictions are available via `doc.cats`.
-
-For more details, see the documentation:
+spacy.pipeline, and predictions are available via `doc.cats`. For more details,
+see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Text classification: https://alpha.spacy.io/usage/text-classification
diff --git a/examples/vectors_fast_text.py b/examples/vectors_fast_text.py
index 159250098..5b763fe0a 100644
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@@ -7,14 +7,13 @@ from __future__ import unicode_literals
import plac
import numpy
-import from spacy.language import Language
+from spacy.language import Language
@plac.annotations(
vectors_loc=("Path to vectors", "positional", None, str))
def main(vectors_loc):
- nlp = Language()
-
+ nlp = Language() # start off with a blank Language class
with open(vectors_loc, 'rb') as file_:
header = file_.readline()
nr_row, nr_dim = header.split()
@@ -24,9 +23,11 @@ def main(vectors_loc):
pieces = line.split()
word = pieces[0]
vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
- nlp.vocab.set_vector(word, vector)
- doc = nlp(u'class colspan')
- print(doc[0].similarity(doc[1]))
+ nlp.vocab.set_vector(word, vector) # add the vectors to the vocab
+ # test the vectors and similarity
+ text = 'class colspan'
+ doc = nlp(text)
+ print(text, doc[0].similarity(doc[1]))
if __name__ == '__main__':
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 3157ba99d..5ee8a2b1e 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -99,7 +99,8 @@ def generate_meta(model_path, existing_meta):
nlp = util.load_model_from_path(Path(model_path))
meta['pipeline'] = nlp.pipe_names
meta['vectors'] = {'width': nlp.vocab.vectors_length,
- 'entries': len(nlp.vocab.vectors)}
+ 'vectors': len(nlp.vocab.vectors),
+ 'keys': nlp.vocab.vectors.n_keys}
prints("Enter the package settings for your model. The following "
"information will be read from your model data: pipeline, vectors.",
title="Generating meta.json")
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 74e1d6d68..f489ba7bf 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -146,7 +146,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
'gpu': gpu_wps}
meta['vectors'] = {'width': nlp.vocab.vectors_length,
- 'entries': len(nlp.vocab.vectors)}
+ 'vectors': len(nlp.vocab.vectors),
+ 'keys': nlp.vocab.vectors.n_keys}
meta['lang'] = nlp.lang
meta['pipeline'] = pipeline
meta['spacy_version'] = '>=%s' % about.__version__
diff --git a/spacy/language.py b/spacy/language.py
index 1ce74b265..806172f36 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -155,7 +155,8 @@ class Language(object):
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
self._meta['vectors'] = {'width': self.vocab.vectors_length,
- 'entries': len(self.vocab.vectors)}
+ 'vectors': len(self.vocab.vectors),
+ 'keys': self.vocab.vectors.n_keys}
self._meta['pipeline'] = self.pipe_names
return self._meta
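To illustrate the new format (the model name and numbers here are hypothetical and depend on the model), the vectors entry of the generated meta now reports the number of unique vector rows and the number of keys separately:

    import spacy

    nlp = spacy.load('en_core_web_lg')  # hypothetical vectors-bearing model
    print(nlp.meta['vectors'])
    # e.g. {'width': 300, 'vectors': 20000, 'keys': 684830}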
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index a77fb2236..a96913109 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -184,17 +184,18 @@ cdef class Vectors:
yield key, self.data[row]
def find(self, *, key=None, keys=None, row=None, rows=None):
- '''Lookup one or more keys by row, or vice versa.
+ """Look up one or more keys by row, or vice versa.
key (unicode / int): Find the row that the given key points to.
Returns int, -1 if missing.
- keys (sequence): Find rows that the keys point to.
+ keys (iterable): Find rows that the keys point to.
Returns ndarray.
row (int): Find the first key that points to the row.
Returns int.
- rows (sequence): Find the first keys that points to the rows.
+ rows (iterable): Find the keys that point to the rows.
Returns ndarray.
- '''
+ RETURNS: The requested key, keys, row or rows.
+ """
if sum(arg is None for arg in (key, keys, row, rows)) != 3:
raise ValueError("One (and only one) keyword arg must be set.")
xp = get_array_module(self.data)
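A usage sketch based on the docstring above; the keys are stand-in hash values, not real entries:

    import numpy
    from spacy.vectors import Vectors

    data = numpy.zeros((3, 300), dtype='f')
    keys = [123, 456, 789]  # stand-in hash keys
    vectors = Vectors(data=data, keys=keys)

    print(vectors.find(key=123))          # row the key points to, -1 if missing
    print(vectors.find(keys=[456, 789]))  # ndarray of row indices
    print(vectors.find(row=2))            # first key that points to that row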
diff --git a/website/api/vectors.jade b/website/api/vectors.jade
index 692bd1ca8..9685188c5 100644
--- a/website/api/vectors.jade
+++ b/website/api/vectors.jade
@@ -5,46 +5,47 @@ include ../_includes/_mixins
p
| Vectors data is kept in the #[code Vectors.data] attribute, which should
| be an instance of #[code numpy.ndarray] (for CPU vectors) or
- | #[code cupy.ndarray] (for GPU vectors).
+ | #[code cupy.ndarray] (for GPU vectors). Multiple keys can be mapped to
+ | the same vector, and not all of the rows in the table need to be
+ | assigned – so #[code vectors.n_keys] may be greater or smaller than
+ | #[code vectors.shape[0]].
+h(2, "init") Vectors.__init__
+tag method
p
- | Create a new vector store. To keep the vector table empty, pass
- | #[code width=0]. You can also create the vector table and add
- | vectors one by one, or set the vector values directly on initialisation.
+ | Create a new vector store. You can set the vector values and keys
+ | directly on initialisation, or supply a #[code shape] keyword argument
+ | to create an empty table you can add vectors to later.
+aside-code("Example").
from spacy.vectors import Vectors
- from spacy.strings import StringStore
- empty_vectors = Vectors(StringStore())
+ empty_vectors = Vectors(shape=(10000, 300))
- vectors = Vectors([u'cat'], width=300)
- vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
-
- vector_table = numpy.zeros((3, 300), dtype='f')
- vectors = Vectors(StringStore(), data=vector_table)
+ data = numpy.zeros((3, 300), dtype='f')
+ keys = [u'cat', u'dog', u'rat']
+ vectors = Vectors(data=data, keys=keys)
+table(["Name", "Type", "Description"])
- +row
- +cell #[code strings]
- +cell #[code StringStore] or list
- +cell
- | List of strings, or a #[+api("stringstore") #[code StringStore]]
- | that maps strings to hash values, and vice versa.
-
- +row
- +cell #[code width]
- +cell int
- +cell Number of dimensions.
-
+row
+cell #[code data]
- +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+ +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+cell The vector data.
+ +row
+ +cell #[code keys]
+ +cell iterable
+ +cell A sequence of keys aligned with the data.
+
+ +row
+ +cell #[code shape]
+ +cell tuple
+ +cell
+ | Size of the table as #[code (n_entries, n_columns)], the number
+ | of entries and number of columns. Not required if you're
+ | initialising the object with #[code data] and #[code keys].
+
+row("foot")
+cell returns
+cell #[code Vectors]
@@ -54,97 +55,92 @@ p
+tag method
p
- | Get a vector by key. If key is a string, it is hashed to an integer ID
- | using the #[code Vectors.strings] table. If the integer key is not found
- | in the table, a #[code KeyError] is raised.
+ | Get a vector by key. If the key is not found in the table, a
+ | #[code KeyError] is raised.
+aside-code("Example").
- vectors = Vectors(StringStore(), 300)
- vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
- cat_vector = vectors[u'cat']
+ cat_id = nlp.vocab.strings[u'cat']
+ cat_vector = nlp.vocab.vectors[cat_id]
+ assert (cat_vector == nlp.vocab[u'cat'].vector).all()
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
- +cell unicode / int
+ +cell int
+cell The key to get the vector for.
+row
+cell returns
- +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+ +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+cell The vector for the key.
+h(2, "setitem") Vectors.__setitem__
+tag method
p
- | Set a vector for the given key. If key is a string, it is hashed to an
- | integer ID using the #[code Vectors.strings] table.
+ | Set a vector for the given key.
+aside-code("Example").
- vectors = Vectors(StringStore(), 300)
- vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
+ cat_id = nlp.vocab.strings[u'cat']
+ vector = numpy.random.uniform(-1, 1, (300,))
+ nlp.vocab.vectors[cat_id] = vector
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
- +cell unicode / int
+ +cell int
+cell The key to set the vector for.
+row
+cell #[code vector]
- +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+ +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+cell The vector to set.
+h(2, "iter") Vectors.__iter__
+tag method
-p Yield vectors from the table.
+p Iterate over the keys in the table.
+aside-code("Example").
- vector_table = numpy.zeros((3, 300), dtype='f')
- vectors = Vectors(StringStore(), vector_table)
- for vector in vectors:
- print(vector)
+ for key in nlp.vocab.vectors:
+ print(key, nlp.vocab.strings[key])
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
- +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
- +cell A vector from the table.
+ +cell int
+ +cell A key in the table.
+h(2, "len") Vectors.__len__
+tag method
-p Return the number of vectors that have been assigned.
+p Return the number of vectors in the table.
+aside-code("Example").
- vector_table = numpy.zeros((3, 300), dtype='f')
- vectors = Vectors(StringStore(), vector_table)
+ vectors = Vectors(shape=(3, 300))
assert len(vectors) == 3
+table(["Name", "Type", "Description"])
+row("foot")
+cell returns
+cell int
- +cell The number of vectors in the data.
+ +cell The number of vectors in the table.
+h(2, "contains") Vectors.__contains__
+tag method
p
- | Check whether a key has a vector entry in the table. If key is a string,
- | it is hashed to an integer ID using the #[code Vectors.strings] table.
+ | Check whether a key has been mapped to a vector entry in the table.
+aside-code("Example").
- vectors = Vectors(StringStore(), 300)
- vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
- assert u'cat' in vectors
+ cat_id = nlp.vocab.strings[u'cat']
+ nlp.vocab.vectors.add(cat_id, numpy.random.uniform(-1, 1, (300,)))
+ assert cat_id in nlp.vocab.vectors
+table(["Name", "Type", "Description"])
+row
+cell #[code key]
- +cell unicode / int
+ +cell int
+cell The key to check.
+row("foot")
@@ -156,13 +152,20 @@ p
+tag method
p
- | Add a key to the table, optionally setting a vector value as well. If
- | key is a string, it is hashed to an integer ID using the
- | #[code Vectors.strings] table.
+ | Add a key to the table, optionally setting a vector value as well. Keys
+ | can be mapped to an existing vector by setting #[code row], or a new
+ | vector can be added. When adding unicode keys, keep in mind that the
+ | #[code Vectors] class itself has no
+ | #[+api("stringstore") #[code StringStore]], so you have to store the
+ | hash-to-string mapping separately. If you need to manage the strings,
+ | you should use the #[code Vectors] via the
+ | #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors].
+aside-code("Example").
- vectors = Vectors(StringStore(), 300)
- vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
+ vector = numpy.random.uniform(-1, 1, (300,))
+ cat_id = nlp.vocab.strings[u'cat']
+ nlp.vocab.vectors.add(cat_id, vector=vector)
+ nlp.vocab.vectors.add(u'dog', row=0)
+table(["Name", "Type", "Description"])
+row
@@ -172,25 +175,66 @@ p
+row
+cell #[code vector]
- +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
- +cell An optional vector to add.
+ +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+ +cell An optional vector to add for the key.
+
+ +row
+ +cell #[code row]
+ +cell int
+ +cell An optional row number of a vector to map the key to.
+
+ +row("foot")
+ +cell returns
+ +cell int
+ +cell The row the vector was added to.
+
++h(2, "keys") Vectors.keys
+ +tag method
+
+p A sequence of the keys in the table.
+
++aside-code("Example").
+ for key in nlp.vocab.vectors.keys():
+ print(key, nlp.vocab.strings[key])
+
++table(["Name", "Type", "Description"])
+ +row("foot")
+ +cell returns
+ +cell iterable
+ +cell The keys.
+
++h(2, "values") Vectors.values
+ +tag method
+
+p
+ | Iterate over vectors that have been assigned to at least one key. Note
+ | that some vectors may be unassigned, so the number of vectors returned
+ | may be less than the length of the vectors table.
+
++aside-code("Example").
+ for vector in nlp.vocab.vectors.values():
+ print(vector)
+
++table(["Name", "Type", "Description"])
+ +row("foot")
+ +cell yields
+ +cell #[code.u-break ndarray[ndim=1, dtype='float32']]
+ +cell A vector in the table.
+h(2, "items") Vectors.items
+tag method
-p Iterate over #[code (string key, vector)] pairs, in order.
+p Iterate over #[code (key, vector)] pairs, in order.
+aside-code("Example").
- vectors = Vectors(StringStore(), 300)
- vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
- for key, vector in vectors.items():
- print(key, vector)
+ for key, vector in nlp.vocab.vectors.items():
+ print(key, nlp.vocab.strings[key], vector)
+table(["Name", "Type", "Description"])
+row("foot")
+cell yields
+cell tuple
- +cell #[code (string key, vector)] pairs, in order.
+ +cell #[code (key, vector)] pairs, in order.
+h(2, "shape") Vectors.shape
+tag property
@@ -200,7 +244,7 @@ p
| dimensions in the vector table.
+aside-code("Example").
- vectors = Vectors(StringStore(), 300)
+ vectors = Vectors(shape=(1, 300))
vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
rows, dims = vectors.shape
assert rows == 1
@@ -212,6 +256,59 @@ p
+cell tuple
+cell A #[code (rows, dims)] pair.
++h(2, "size") Vectors.size
+ +tag property
+
+p The vector size, i.e. #[code rows * dims].
+
++aside-code("Example").
+ vectors = Vectors(shape=(500, 300))
+ assert vectors.size == 150000
+
++table(["Name", "Type", "Description"])
+ +row("foot")
+ +cell returns
+ +cell int
+ +cell The vector size.
+
++h(2, "is_full") Vectors.is_full
+ +tag property
+
+p
+ | Whether the vectors table is full and has no slots available for new
+ | keys. If a table is full, it can be resized using
+ | #[+api("vectors#resize") #[code Vectors.resize]].
+
++aside-code("Example").
+ vectors = Vectors(shape=(1, 300))
+ vectors.add(u'cat', numpy.random.uniform(-1, 1, (300,)))
+ assert vectors.is_full
+
++table(["Name", "Type", "Description"])
+ +row("foot")
+ +cell returns
+ +cell bool
+ +cell Whether the vectors table is full.
+
++h(2, "n_keys") Vectors.n_keys
+ +tag property
+
+p
+ | Get the number of keys in the table. Note that this is the number of
+ | #[em all] keys, not just unique vectors. If several keys are mapped
+ | to the same vector, they will be counted individually.
+
++aside-code("Example").
+ vectors = Vectors(shape=(10, 300))
+ assert len(vectors) == 10
+ assert vectors.n_keys == 0
+
++table(["Name", "Type", "Description"])
+ +row("foot")
+ +cell returns
+ +cell int
+ +cell The number of all keys in the table.
+
+h(2, "from_glove") Vectors.from_glove
+tag method
@@ -223,6 +320,10 @@ p
| float32 vectors, #[code vectors.300.d.bin] for 300d float64 (double)
| vectors, etc. By default GloVe outputs 64-bit vectors.
++aside-code("Example").
+ vectors = Vectors()
+ vectors.from_glove('/path/to/glove_vectors')
+
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
@@ -323,7 +424,7 @@ p Load state from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
- +cell #[code numpy.ndarray] / #[code cupy.ndarray]
+ +cell #[code.u-break ndarray[ndim=2, dtype='float32']]
+cell
| Stored vectors data. #[code numpy] is used for CPU vectors,
| #[code cupy] for GPU vectors.
@@ -337,7 +438,7 @@ p Load state from a binary string.
+row
+cell #[code keys]
- +cell #[code numpy.ndarray]
+ +cell #[code.u-break ndarray[ndim=1, dtype='uint64']]
+cell
| Array keeping the keys in order, such that
| #[code keys[vectors.key2row[key]] == key]
diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass
index 8c1e82706..9b1c0cedc 100644
--- a/website/assets/css/_base/_utilities.sass
+++ b/website/assets/css/_base/_utilities.sass
@@ -47,7 +47,7 @@
font: 600 1.1rem/#{1} $font-secondary
background: $color-theme
color: $color-back
- padding: 0.15em 0.5em 0.35em
+ padding: 2px 6px 4px
border-radius: 1em
text-transform: uppercase
vertical-align: middle
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index e79073edd..8df49acc2 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -1,6 +1,6 @@
'use strict';
-import { Templater, handleResponse, convertNumber } from './util.js';
+import { Templater, handleResponse, convertNumber, abbrNumber } from './util.js';
/**
* Chart.js defaults
@@ -25,7 +25,7 @@ export const formats = {
license: (license, url) => url ? `${license}` : license,
sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `${p}`).join(', ') : '-',
- vectors: vec => vec ? `${convertNumber(vec.entries)} (${vec.width} dimensions)` : 'n/a',
+ vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
version: version => `v${version}`
};
@@ -240,7 +240,8 @@ export class ModelComparer {
return data;
}
- showError() {
+ showError(err) {
+ console.error(err);
this.tpl.get('result').style.display = 'none';
this.tpl.get('error').style.display = 'block';
}
diff --git a/website/assets/js/util.js b/website/assets/js/util.js
index 6bf14f578..65d05774c 100644
--- a/website/assets/js/util.js
+++ b/website/assets/js/util.js
@@ -46,11 +46,24 @@ export const handleResponse = res => {
else return ({ ok: res.ok })
};
-
/**
* Convert a number to a string and add thousand separator.
* @param {number|string} num - The number to convert.
* @param {string} separator – Thousand separator.
*/
-export const convertNumber = (num, separator = ',') =>
+export const convertNumber = (num = 0, separator = ',') =>
num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
+
+/**
+ * Abbreviate a number, e.g. 14249930 --> 14.25m.
+ * @param {number|string} num - The number to convert.
+ * @param {number} fixed - Number of decimals.
+ */
+export const abbrNumber = (num = 0, fixed = 2) => {
+ const suffixes = ['', 'k', 'm', 'b', 't'];
+ if (num === null || num === 0) return 0;
+ const b = num.toPrecision(2).split('e');
+ const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
+ const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+ return (c < 0 ? c : Math.abs(c)) + suffixes[k];
+}
diff --git a/website/models/_data.json b/website/models/_data.json
index 959d73133..cb971e20c 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -100,6 +100,7 @@
"hu": "Hungarian",
"pl": "Polish",
"he": "Hebrew",
+ "ga": "Irish",
"bn": "Bengali",
"hi": "Hindi",
"id": "Indonesian",
@@ -114,6 +115,8 @@
"de": "Dies ist ein Satz.",
"fr": "C'est une phrase.",
"es": "Esto es una frase.",
+ "pt": "Esta é uma frase.",
+ "it": "Questa è una frase.",
"xx": "This is a sentence about Facebook."
}
}
diff --git a/website/usage/_data.json b/website/usage/_data.json
index 4a4e6df01..498202695 100644
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@@ -116,7 +116,6 @@
"next": "text-classification",
"menu": {
"Basics": "basics",
- "Similarity in Context": "in-context",
"Custom Vectors": "custom",
"GPU Usage": "gpu"
}
diff --git a/website/usage/_install/_quickstart.jade b/website/usage/_install/_quickstart.jade
index 8e581994c..b4ee10c4b 100644
--- a/website/usage/_install/_quickstart.jade
+++ b/website/usage/_install/_quickstart.jade
@@ -19,6 +19,7 @@
+qs({package: 'source'}) git clone https://github.com/explosion/spaCy
+qs({package: 'source'}) cd spaCy
+ +qs({package: 'source'}) export PYTHONPATH=`pwd`
+qs({package: 'source'}) pip install -r requirements.txt
+qs({package: 'source'}) pip install -e .
diff --git a/website/usage/_models/_languages.jade b/website/usage/_models/_languages.jade
index 4337b5b99..7163d8448 100644
--- a/website/usage/_models/_languages.jade
+++ b/website/usage/_models/_languages.jade
@@ -46,7 +46,6 @@ p
+item #[strong Chinese]: #[+a("https://github.com/fxsjy/jieba") Jieba]
+item #[strong Japanese]: #[+a("https://github.com/mocobeta/janome") Janome]
+item #[strong Thai]: #[+a("https://github.com/wannaphongcom/pythainlp") pythainlp]
- +item #[strong Russian]: #[+a("https://github.com/kmike/pymorphy2") pymorphy2]
+h(3, "multi-language") Multi-language support
+tag-new(2)
diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade
index b8f8d834c..300680331 100644
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@@ -13,3 +13,127 @@
include ../_spacy-101/_similarity
include ../_spacy-101/_word-vectors
+
++h(3, "in-context") Similarities in context
+
+p
+ | Aside from spaCy's built-in word vectors, which were trained on a lot of
+ | text with a wide vocabulary, the parsing, tagging and NER models also
+ | rely on vector representations of the #[strong meanings of words in context].
+ | As the first component of the
+ | #[+a("/usage/processing-pipelines") processing pipeline], the
+ | tensorizer encodes a document's internal meaning representations as an
+ | array of floats, also called a tensor. This allows spaCy to make a
+ | reasonable guess at a word's meaning, based on its surrounding words.
+ | Even if a word hasn't been seen before, spaCy will know #[em something]
+ | about it. Because spaCy uses a 4-layer convolutional network, the
+ | tensors are sensitive to up to #[strong four words on either side] of a
+ | word.
+
+p
+ | For example, here are three sentences containing the out-of-vocabulary
+ | word "labrador" in different contexts.
+
++code.
+ doc1 = nlp(u"The labrador barked.")
+ doc2 = nlp(u"The labrador swam.")
+ doc3 = nlp(u"the labrador people live in canada.")
+
+ for doc in [doc1, doc2, doc3]:
+ labrador = doc[1]
+ dog = nlp(u"dog")
+ print(labrador.similarity(dog))
+
+p
+ | Even though the model has never seen the word "labrador", it can make a
+ | fairly accurate prediction of its similarity to "dog" in different
+ | contexts.
+
++table(["Context", "labrador.similarity(dog)"])
+ +row
+ +cell The #[strong labrador] barked.
+ +cell #[code 0.56] #[+procon("yes", "similar")]
+
+ +row
+ +cell The #[strong labrador] swam.
+ +cell #[code 0.48] #[+procon("no", "dissimilar")]
+
+ +row
+ +cell the #[strong labrador] people live in canada.
+ +cell #[code 0.39] #[+procon("no", "dissimilar")]
+
+p
+ | The same also works for whole documents. Here, the variance of the
+ | similarities is lower, as all words and their order are taken into
+ | account. However, the context-specific similarity is often still
+ | reflected pretty accurately.
+
++code.
+ doc1 = nlp(u"Paris is the largest city in France.")
+ doc2 = nlp(u"Vilnius is the capital of Lithuania.")
+ doc3 = nlp(u"An emu is a large bird.")
+
+ for doc in [doc1, doc2, doc3]:
+ for other_doc in [doc1, doc2, doc3]:
+ print(doc.similarity(other_doc))
+
+p
+ | Even though the sentences about Paris and Vilnius consist of different
+ | words and entities, they both describe the same concept and are seen as
+ | more similar than the sentence about emus. In this case, even a misspelled
+ | version of "Vilnius" would still produce very similar results.
+
++table
+ - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
+ - var counter = 0
+
+ +row("head")
+ +cell
+ for _, label in examples
+ +cell=label
+
+ each cells, label in examples
+ +row(counter ? null : "divider")
+ +cell=label
+ for cell in cells
+ +cell.u-text-center
+ - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+ | #[code=cell.toFixed(2)] #[+procon(...result)]
+ - counter++
+
+p
+ | Sentences that consist of the same words in different order will likely
+ | be seen as very similar – but never identical.
+
++code.
+ docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
+ nlp(u"man dog bites"), nlp(u"dog man bites")]
+
+ for doc in docs:
+ for other_doc in docs:
+ print(doc.similarity(other_doc))
+
+p
+ | Interestingly, "man bites dog" and "man dog bites" are seen as slightly
+ | more similar than "man bites dog" and "dog bites man". This may be a
+ | coincidence – or the result of "man" being interpreted as both sentences'
+ | subject.
+
++table
+ - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
+ - var counter = 0
+
+ +row("head")
+ +cell
+ for _, label in examples
+ +cell.u-text-center=label
+
+ each cells, label in examples
+ +row(counter ? null : "divider")
+ +cell=label
+ for cell in cells
+ +cell.u-text-center
+ - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
+ | #[code=cell.toFixed(2)] #[+procon(...result)]
+ - counter++
diff --git a/website/usage/_vectors-similarity/_custom.jade b/website/usage/_vectors-similarity/_custom.jade
index da4be39fd..7792949d1 100644
--- a/website/usage/_vectors-similarity/_custom.jade
+++ b/website/usage/_vectors-similarity/_custom.jade
@@ -1,49 +1,137 @@
//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > CUSTOM VECTORS
p
- | By default, #[+api("token#vector") #[code Token.vector]] returns the
- | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
- | #[+api("doc#vector") #[code Doc.vector]] and
- | #[+api("span#vector") #[code Span.vector]] return an average of the
- | vectors of their tokens. You can customize these
- | behaviours by modifying the #[code doc.user_hooks],
- | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
- | dictionaries.
+ | Word vectors let you import knowledge from raw text into your model. The
+ | knowledge is represented as a table of numbers, with one row per term in
+ | your vocabulary. If two terms are used in similar contexts, the algorithm
+ | that learns the vectors should assign them
+ | #[strong rows that are quite similar], while words that are used in
+ | different contexts will have quite different values. This lets you use
+ | the row-values assigned to the words as a kind of dictionary, to tell you
+ | some things about what the words in your text mean.
-+infobox
- | For more details on #[strong adding hooks] and #[strong overwriting] the
- | built-in #[code Doc], #[code Span] and #[code Token] methods, see the
- | usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
+p
+ | Word vectors are particularly useful for terms which
+ | #[strong aren't well represented in your labelled training data].
+ | For instance, if you're doing named entity recognition, there will always
+ | be lots of names that you don't have examples of. For example, imagine
+ | your training data happens to contain some examples of the term
+ | "Microsoft", but it doesn't contain any examples of the term "Symantec".
+ | In your raw text sample, there are plenty of examples of both terms, and
+ | they're used in similar contexts. The word vectors make that fact
+ | available to the entity recognition model. It still won't see examples of
+ | "Symantec" labelled as a company. However, it'll see that "Symantec" has
+ | a word vector that usually corresponds to company terms, so it can
+ | #[strong make the inference].
+
+p
+ | In order to make best use of the word vectors, you want the word vectors
+ | table to cover a #[strong very large vocabulary]. However, most words are
+ | rare, so most of the rows in a large word vectors table will be accessed
+ | very rarely, or never at all. You can usually cover more than
+ | #[strong 95% of the tokens] in your corpus with just
+ | #[strong a few thousand rows] in the vector table. However, it's those
+ | #[strong 5% of rare terms] where the word vectors are
+ | #[strong most useful]. The problem is that increasing the size of the
+ | vector table produces rapidly diminishing returns in coverage over these
+ | rare terms.
+
++h(3, "custom-vectors-coverage") Optimising vector coverage
+ +tag-new(2)
+
+p
+ | To help you strike a good balance between coverage and memory usage,
+ | spaCy's #[+api("vectors") #[code Vectors]] class lets you map
+ | #[strong multiple keys] to the #[strong same row] of the table. If
+ | you're using the #[+api("cli#vocab") #[code spacy vocab]] command to
+ | create a vocabulary, pruning the vectors will be taken care of
+ | automatically. You can also do it manually in the following steps:
+
++list("numbers")
+ +item
+ | Start with a #[strong word vectors model] that covers a huge
+ | vocabulary. For instance, the
+ | #[+a("/models/en#en_vectors_web_lg") #[code en_vectors_web_lg]] model
+ | provides 300-dimensional GloVe vectors for over 1 million terms of
+ | English.
+
+ +item
+ | If your vocabulary has values set for the #[code Lexeme.prob]
+ | attribute, the lexemes will be sorted by descending probability to
+ | determine which vectors to prune. Otherwise, lexemes will be sorted
+ | by their order in the #[code Vocab].
+
+ +item
+ | Call #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] with
+ | the number of vectors you want to keep.
+
++code.
+ nlp = spacy.load('en_vectors_web_lg')
+ n_vectors = 105000 # number of vectors to keep
+ removed_words = nlp.vocab.prune_vectors(n_vectors)
+
+ assert len(nlp.vocab.vectors) <= n_vectors # unique vectors have been pruned
+ assert nlp.vocab.vectors.n_keys > n_vectors # but not the total entries
+
+p
+ | #[+api("vocab#prune_vectors") #[code Vocab.prune_vectors]] reduces the
+ | current vector table to a given number of unique entries, and returns a
+ | dictionary containing the removed words, mapped to #[code (string, score)]
+ | tuples, where #[code string] is the entry the removed word was mapped
+ | to, and #[code score] the similarity score between the two words.
+
++code("Removed words").
+ {
+ 'Shore': ('coast', 0.732257),
+ 'Precautionary': ('caution', 0.490973),
+ 'hopelessness': ('sadness', 0.742366),
+ 'Continous': ('continuous', 0.732549),
+ 'Disemboweled': ('corpse', 0.499432),
+ 'biostatistician': ('scientist', 0.339724),
+ 'somewheres': ('somewheres', 0.402736),
+ 'observing': ('observe', 0.823096),
+ 'Leaving': ('leaving', 1.0)
+ }
+
+p
+ | In the example above, the vector for "Shore" was removed and remapped
+ | to the vector of "coast", which is deemed about 73% similar. "Leaving"
+ | was remapped to the vector of "leaving", which is identical.
+h(3, "custom-vectors-add") Adding vectors
+tag-new(2)
p
- | The new #[+api("vectors") #[code Vectors]] class makes it easy to add
- | your own vectors to spaCy. Just like the #[+api("vocab") #[code Vocab]],
- | it is initialised with a #[+api("stringstore") #[code StringStore]] or
- | a list of strings.
+ | spaCy's new #[+api("vectors") #[code Vectors]] class greatly improves the
+ | way word vectors are stored, accessed and used. The data is stored in
+ | two structures:
-+code("Adding vectors one-by-one").
- from spacy.strings import StringStore
- from spacy.vectors import Vectors
++list
+ +item
+ | An array, which can be either on CPU or #[+a("#gpu") GPU].
- vector_data = {'dog': numpy.random.uniform(-1, 1, (300,)),
- 'cat': numpy.random.uniform(-1, 1, (300,)),
- 'orange': numpy.random.uniform(-1, 1, (300,))}
-
- vectors = Vectors(StringStore(), 300)
- for word, vector in vector_data.items():
- vectors.add(word, vector)
+ +item
+ | A dictionary mapping string-hashes to rows in the table.
p
- | You can also add the vector values directly on initialisation:
+ | Keep in mind that the #[code Vectors] class itself has no
+ | #[+api("stringstore") #[code StringStore]], so you have to store the
+ | hash-to-string mapping separately. If you need to manage the strings,
+ | you should use the #[code Vectors] via the
+ | #[+api("vocab") #[code Vocab]] class, e.g. #[code vocab.vectors]. To
+ | add vectors to the vocabulary, you can use the
+ | #[+api("vocab#set_vector") #[code Vocab.set_vector]] method.
-+code("Adding vectors on initialisation").
- from spacy.vectors import Vectors
++code("Adding vectors").
+ from spacy.vocab import Vocab
- vector_table = numpy.zeros((3, 300), dtype='f')
- vectors = Vectors([u'dog', u'cat', u'orange'], vector_table)
+ vector_data = {u'dog': numpy.random.uniform(-1, 1, (300,)),
+ u'cat': numpy.random.uniform(-1, 1, (300,)),
+ u'orange': numpy.random.uniform(-1, 1, (300,))}
+
+ vocab = Vocab()
+ for word, vector in vector_data.items():
+ vocab.set_vector(word, vector)
+h(3, "custom-loading-glove") Loading GloVe vectors
+tag-new(2)
@@ -89,3 +177,20 @@ p
| #[+api("vocab#set_vector") #[code set_vector]] method.
+github("spacy", "examples/vectors_fast_text.py")
+
++h(3, "custom-similarity") Using custom similarity methods
+
+p
+ | By default, #[+api("token#vector") #[code Token.vector]] returns the
+ | vector for its underlying #[+api("lexeme") #[code Lexeme]], while
+ | #[+api("doc#vector") #[code Doc.vector]] and
+ | #[+api("span#vector") #[code Span.vector]] return an average of the
+ | vectors of their tokens. You can customise these
+ | behaviours by modifying the #[code doc.user_hooks],
+ | #[code doc.user_span_hooks] and #[code doc.user_token_hooks]
+ | dictionaries.
+
++infobox
+ | For more details on #[strong adding hooks] and #[strong overwriting] the
+ | built-in #[code Doc], #[code Span] and #[code Token] methods, see the
+ | usage guide on #[+a("/usage/processing-pipelines#user-hooks") user hooks].
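A minimal sketch of such a hook; the first-token cosine comparison is invented purely for illustration and is not a recommended similarity function:

    import numpy
    import spacy

    def first_token_similarity(doc1, doc2):
        # toy hook: compare only the first token's vector of each doc
        v1, v2 = doc1[0].vector, doc2[0].vector
        return numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))

    def add_similarity_hook(doc):
        doc.user_hooks['similarity'] = first_token_similarity
        return doc

    nlp = spacy.load('en_core_web_sm')  # assumption: any model providing vectors/tensors
    nlp.add_pipe(add_similarity_hook, first=True)
    doc1, doc2 = nlp(u"dog barks"), nlp(u"cat meows")
    print(doc1.similarity(doc2))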
diff --git a/website/usage/_vectors-similarity/_in-context.jade b/website/usage/_vectors-similarity/_in-context.jade
deleted file mode 100644
index becd74348..000000000
--- a/website/usage/_vectors-similarity/_in-context.jade
+++ /dev/null
@@ -1,123 +0,0 @@
-//- 💫 DOCS > USAGE > VECTORS & SIMILARITY > IN CONTEXT
-
-p
- | Aside from spaCy's built-in word vectors, which were trained on a lot of
- | text with a wide vocabulary, the parsing, tagging and NER models also
- | rely on vector representations of the #[strong meanings of words in context].
- | As the first component of the
- | #[+a("/usage/processing-pipelines") processing pipeline], the
- | tensorizer encodes a document's internal meaning representations as an
- | array of floats, also called a tensor. This allows spaCy to make a
- | reasonable guess at a word's meaning, based on its surrounding words.
- | Even if a word hasn't been seen before, spaCy will know #[em something]
- | about it. Because spaCy uses a 4-layer convolutional network, the
- | tensors are sensitive to up to #[strong four words on either side] of a
- | word.
-
-p
- | For example, here are three sentences containing the out-of-vocabulary
- | word "labrador" in different contexts.
-
-+code.
- doc1 = nlp(u"The labrador barked.")
- doc2 = nlp(u"The labrador swam.")
- doc3 = nlp(u"the labrador people live in canada.")
-
- for doc in [doc1, doc2, doc3]:
- labrador = doc[1]
- dog = nlp(u"dog")
- print(labrador.similarity(dog))
-
-p
- | Even though the model has never seen the word "labrador", it can make a
- | fairly accurate prediction of its similarity to "dog" in different
- | contexts.
-
-+table(["Context", "labrador.similarity(dog)"])
- +row
- +cell The #[strong labrador] barked.
- +cell #[code 0.56] #[+procon("yes", "similar")]
-
- +row
- +cell The #[strong labrador] swam.
- +cell #[code 0.48] #[+procon("no", "dissimilar")]
-
- +row
- +cell the #[strong labrador] people live in canada.
- +cell #[code 0.39] #[+procon("no", "dissimilar")]
-
-p
- | The same also works for whole documents. Here, the variance of the
- | similarities is lower, as all words and their order are taken into
- | account. However, the context-specific similarity is often still
- | reflected pretty accurately.
-
-+code.
- doc1 = nlp(u"Paris is the largest city in France.")
- doc2 = nlp(u"Vilnius is the capital of Lithuania.")
- doc3 = nlp(u"An emu is a large bird.")
-
- for doc in [doc1, doc2, doc3]:
- for other_doc in [doc1, doc2, doc3]:
- print(doc.similarity(other_doc))
-
-p
- | Even though the sentences about Paris and Vilnius consist of different
- | words and entities, they both describe the same concept and are seen as
- | more similar than the sentence about emus. In this case, even a misspelled
- | version of "Vilnius" would still produce very similar results.
-
-+table
- - var examples = {"Paris is the largest city in France.": [1, 0.85, 0.65], "Vilnius is the capital of Lithuania.": [0.85, 1, 0.55], "An emu is a large bird.": [0.65, 0.55, 1]}
- - var counter = 0
-
- +row
- +row
- +cell
- for _, label in examples
- +cell=label
-
- each cells, label in examples
- +row(counter ? null : "divider")
- +cell=label
- for cell in cells
- +cell.u-text-center
- - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
- | #[code=cell.toFixed(2)] #[+procon(...result)]
- - counter++
-
-p
- | Sentences that consist of the same words in different order will likely
- | be seen as very similar – but never identical.
-
-+code.
- docs = [nlp(u"dog bites man"), nlp(u"man bites dog"),
- nlp(u"man dog bites"), nlp(u"dog man bites")]
-
- for doc in docs:
- for other_doc in docs:
- print(doc.similarity(other_doc))
-
-p
- | Interestingly, "man bites dog" and "man dog bites" are seen as slightly
- | more similar than "man bites dog" and "dog bites man". This may be a
- | conincidence – or the result of "man" being interpreted as both sentence's
- | subject.
-
-+table
- - var examples = {"dog bites man": [1, 0.9, 0.89, 0.92], "man bites dog": [0.9, 1, 0.93, 0.9], "man dog bites": [0.89, 0.93, 1, 0.92], "dog man bites": [0.92, 0.9, 0.92, 1]}
- - var counter = 0
-
- +row("head")
- +cell
- for _, label in examples
- +cell.u-text-center=label
-
- each cells, label in examples
- +row(counter ? null : "divider")
- +cell=label
- for cell in cells
- +cell.u-text-center
- - var result = cell < 0.7 ? ["no", "dissimilar"] : cell != 1 ? ["yes", "similar"] : ["neutral", "identical"]
- | #[code=cell.toFixed(2)] #[+procon(...result)]
- - counter++
diff --git a/website/usage/vectors-similarity.jade b/website/usage/vectors-similarity.jade
index 1e1139b20..fd70910ae 100644
--- a/website/usage/vectors-similarity.jade
+++ b/website/usage/vectors-similarity.jade
@@ -5,10 +5,6 @@ include ../_includes/_mixins
+section("basics")
include _vectors-similarity/_basics
-+section("in-context")
- +h(2, "in-context") Similarities in context
- include _vectors-similarity/_in-context
-
+section("custom")
+h(2, "custom") Customising word vectors
include _vectors-similarity/_custom