Tidy up Vectors and docs

This commit is contained in:
ines 2017-10-27 19:45:19 +02:00
parent 7946464742
commit 5167a0cce2
3 changed files with 151 additions and 102 deletions

View File

@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy
from collections import OrderedDict
import msgpack
@ -9,23 +10,20 @@ cimport numpy as np
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
from .typedefs cimport attr_t
from .strings cimport StringStore
from . import util
from .compat import basestring_, path2str
from . import util
cdef class Vectors:
'''Store, save and load word vectors.
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors)
or cupy.ndarray (for GPU vectors).
vectors.key2row is a dictionary mapping word hashes to rows
in the vectors.data table. The array `vectors.keys` keeps
the keys in order, such that keys[vectors.key2row[key]] == key.
'''
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
order, such that `keys[vectors.key2row[key]] == key`.
"""
cdef public object data
cdef readonly StringStore strings
cdef public object key2row
@ -33,6 +31,16 @@ cdef class Vectors:
cdef public int i
def __init__(self, strings, width=0, data=None):
"""Create a new vector store. To keep the vector table empty, pass
`width=0`. You can also create the vector table and add vectors one by
one, or set the vector values directly on initialisation.
strings (StringStore or list): List of strings or StringStore that maps
strings to hash values, and vice versa.
width (int): Number of dimensions.
data (numpy.ndarray): The vector data.
RETURNS (Vectors): The newly created object.
"""
if isinstance(strings, StringStore):
self.strings = strings
else:
@ -55,11 +63,13 @@ cdef class Vectors:
return (Vectors, (self.strings, self.data))
def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
"""Get a vector by key. If key is a string, it is hashed to an integer
ID using the vectors.strings table. If the integer key is not found in
the table, a KeyError is raised.
If the integer key is not found in the table, a KeyError is raised.
'''
key (unicode / int): The key to get the vector for.
RETURNS (numpy.ndarray): The vector for the key.
"""
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2row[key]
@ -69,30 +79,47 @@ cdef class Vectors:
return self.data[i]
def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed
"""Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
'''
key (unicode / int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
"""
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
'''Yield vectors from the table.'''
"""Yield vectors from the table.
YIELDS (numpy.ndarray): A vector.
"""
yield from self.data
def __len__(self):
'''Return the number of vectors that have been assigned.'''
"""Return the number of vectors that have been assigned.
RETURNS (int): The number of vectors in the data.
"""
return self.i
def __contains__(self, key):
'''Check whether a key has a vector entry in the table.'''
"""Check whether a key has a vector entry in the table.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
"""
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.'''
"""Add a key to the table, optionally setting a vector value as well.
key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add.
"""
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
@ -110,24 +137,36 @@ cdef class Vectors:
return i
def items(self):
'''Iterate over (string key, vector) pairs, in order.'''
"""Iterate over `(string key, vector)` pairs, in order.
YIELDS (tuple): A key/vector pair.
"""
for i, key in enumerate(self.keys):
string = self.strings[key]
yield string, self.data[i]
@property
def shape(self):
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
"""
return self.data.shape
def most_similar(self, key):
# TODO: implement
raise NotImplementedError
def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format,
"""Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.'''
By default GloVe outputs 64-bit vectors.
path (unicode / Path): The path to load the GloVe vectors from.
"""
path = util.ensure_path(path)
for name in path.iterdir():
if name.parts[-1].startswith('vectors'):
@ -150,9 +189,15 @@ cdef class Vectors:
self.data
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
        it doesn't exist. Either a string or a Path-like object.
"""
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
save_array = lambda arr, file_: xp.save(file_, arr,
allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict((
@ -162,6 +207,12 @@ cdef class Vectors:
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
"""
def load_keys(path):
if path.exists():
self.keys = numpy.load(path2str(path))
@ -182,6 +233,11 @@ cdef class Vectors:
return self
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vectors` object.
"""
def serialize_weights():
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
@ -194,6 +250,12 @@ cdef class Vectors:
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude):
"""Load state from a binary string.
data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vectors): The `Vectors` object.
"""
def deserialize_weights(b):
if hasattr(self.data, 'from_bytes'):
self.data.from_bytes()

View File

@ -1,32 +1,23 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy
import dill
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from collections import OrderedDict
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport PROB, LANG
from .attrs cimport PROB, LANG, ORTH, TAG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle, basestring_
from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models
@ -36,23 +27,22 @@ cdef class Vocab:
C-data that is shared between `Doc` objects.
"""
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs):
strings=tuple(), **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
to compute them. Defaults to `None`.
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
lex_attr_getters (dict): A dictionary mapping attribute IDs to
functions to compute them. Defaults to `None`.
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {})
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
@ -84,19 +74,20 @@ cdef class Vocab:
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id).
to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
value.
flag_getter (callable): A function `f(unicode) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
EXAMPLE:
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
>>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy']
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
>>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True
"""
@ -107,9 +98,10 @@ cdef class Vocab:
break
else:
raise ValueError(
"Cannot find empty bit for new lexical flag. All bits between "
"0 and 63 are occupied. You can replace one by specifying the "
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA")
"Cannot find empty bit for new lexical flag. All bits "
"between 0 and 63 are occupied. You can replace one by "
"specifying the flag_id explicitly, e.g. "
                "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)`.")
elif flag_id >= 64 or flag_id < 1:
raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between "
@ -120,9 +112,9 @@ cdef class Vocab:
return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == u'':
return &EMPTY_LEXEME
@ -139,9 +131,9 @@ cdef class Vocab:
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0:
return &EMPTY_LEXEME
@ -203,8 +195,8 @@ cdef class Vocab:
for orth, addr in self._by_orth.items():
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
previously unseen unicode string is given, a new lexeme is created and
stored.
@ -229,13 +221,14 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
if attrs.TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG])
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)
@ -254,16 +247,13 @@ cdef class Vocab:
self.vectors = Vectors(self.strings, width=new_dim)
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
Words can be looked up by string or int ID.
RETURNS:
A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
RAISES: If no vectors data is loaded, ValueError is raised.
RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
"""
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
@ -273,21 +263,16 @@ cdef class Vocab:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary.
Words can be referenced by string or int ID.
RETURNS:
None
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
"""
if not isinstance(orth, basestring_):
orth = self.strings[orth]
self.vectors.add(orth, vector=vector)
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID."""
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
return orth in self.vectors
@ -296,7 +281,7 @@ cdef class Vocab:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
if not path.exists():
@ -421,16 +406,13 @@ def pickle_vocab(vocab):
length = vocab.length
data_dir = vocab.data_dir
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes()
return (unpickle_vocab,
(sstore, morph, data_dir, lex_attr_getters,
lexemes_data, length))
(sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
def unpickle_vocab(sstore, morphology, data_dir,
lex_attr_getters, bytes lexemes_data, int length):
lex_attr_getters, bytes lexemes_data, int length):
cdef Vocab vocab = Vocab()
vocab.length = length
vocab.strings = sstore
@ -450,12 +432,10 @@ class LookupError(Exception):
@classmethod
def mismatched_strings(cls, id_, id_string, original_string):
return cls(
"Error fetching a Lexeme from the Vocab. When looking up a string, "
"the lexeme returned had an orth ID that did not match the query string. "
"This means that the cached lexeme structs are mismatched to the "
"string encoding table. The mismatched:\n"
"Query string: {query}\n"
"Orth cached: {orth_str}\n"
"ID of orth: {orth_id}".format(
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
)
"Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
            "mismatched to the string encoding table. The mismatch:\n"
"Query string: {}\n"
"Orth cached: {}\n"
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))

View File

@ -36,12 +36,14 @@ p
| that maps strings to hash values, and vice versa.
+row
+cell #[code data]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code width]
+cell int
+cell Number of dimensions.
+row
+cell #[code width]
+cell Number of dimensions.
+cell #[code data]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector data.
+row("foot")
+cell returns
@ -208,7 +210,7 @@ p
+row("foot")
+cell returns
+cell tuple
+cell #[code (rows, dims)] pairs.
+cell A #[code (rows, dims)] pair.
+h(2, "from_glove") Vectors.from_glove
+tag method
@ -238,11 +240,16 @@ p Save the current state to a directory.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell unicode / #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Vectors.from_disk
+tag method
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell unicode / #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
@ -297,7 +304,7 @@ p Load state from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell #[code data]
+cell bytes
+cell The data to load from.