Tidy up Vectors and docs

This commit is contained in:
ines 2017-10-27 19:45:19 +02:00
parent 7946464742
commit 5167a0cce2
3 changed files with 151 additions and 102 deletions

View File

@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy import numpy
from collections import OrderedDict from collections import OrderedDict
import msgpack import msgpack
@ -9,23 +10,20 @@ cimport numpy as np
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model from thinc.neural._classes.model import Model
from .typedefs cimport attr_t
from .strings cimport StringStore from .strings cimport StringStore
from . import util
from .compat import basestring_, path2str from .compat import basestring_, path2str
from . import util
cdef class Vectors: cdef class Vectors:
'''Store, save and load word vectors. """Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors) instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
or cupy.ndarray (for GPU vectors). (for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
vectors.key2row is a dictionary mapping word hashes to rows order, such that `keys[vectors.key2row[key]] == key`.
in the vectors.data table. The array `vectors.keys` keeps """
the keys in order, such that keys[vectors.key2row[key]] == key.
'''
cdef public object data cdef public object data
cdef readonly StringStore strings cdef readonly StringStore strings
cdef public object key2row cdef public object key2row
@ -33,6 +31,16 @@ cdef class Vectors:
cdef public int i cdef public int i
def __init__(self, strings, width=0, data=None): def __init__(self, strings, width=0, data=None):
"""Create a new vector store. To keep the vector table empty, pass
`width=0`. You can also create the vector table and add vectors one by
one, or set the vector values directly on initialisation.
strings (StringStore or list): List of strings or StringStore that maps
strings to hash values, and vice versa.
width (int): Number of dimensions.
data (numpy.ndarray): The vector data.
RETURNS (Vectors): The newly created object.
"""
if isinstance(strings, StringStore): if isinstance(strings, StringStore):
self.strings = strings self.strings = strings
else: else:
@ -55,11 +63,13 @@ cdef class Vectors:
return (Vectors, (self.strings, self.data)) return (Vectors, (self.strings, self.data))
def __getitem__(self, key): def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed """Get a vector by key. If key is a string, it is hashed to an integer
to an integer ID using the vectors.strings table. ID using the vectors.strings table. If the integer key is not found in
the table, a KeyError is raised.
If the integer key is not found in the table, a KeyError is raised. key (unicode / int): The key to get the vector for.
''' RETURNS (numpy.ndarray): The vector for the key.
"""
if isinstance(key, basestring): if isinstance(key, basestring):
key = self.strings[key] key = self.strings[key]
i = self.key2row[key] i = self.key2row[key]
@ -69,30 +79,47 @@ cdef class Vectors:
return self.data[i] return self.data[i]
def __setitem__(self, key, vector): def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed """Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table. to an integer ID using the vectors.strings table.
'''
key (unicode / int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
"""
if isinstance(key, basestring): if isinstance(key, basestring):
key = self.strings.add(key) key = self.strings.add(key)
i = self.key2row[key] i = self.key2row[key]
self.data[i] = vector self.data[i] = vector
def __iter__(self): def __iter__(self):
'''Yield vectors from the table.''' """Yield vectors from the table.
YIELDS (numpy.ndarray): A vector.
"""
yield from self.data yield from self.data
def __len__(self): def __len__(self):
'''Return the number of vectors that have been assigned.''' """Return the number of vectors that have been assigned.
RETURNS (int): The number of vectors in the data.
"""
return self.i return self.i
def __contains__(self, key): def __contains__(self, key):
'''Check whether a key has a vector entry in the table.''' """Check whether a key has a vector entry in the table.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
"""
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings[key] key = self.strings[key]
return key in self.key2row return key in self.key2row
def add(self, key, vector=None): def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.''' """Add a key to the table, optionally setting a vector value as well.
key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add.
"""
if isinstance(key, basestring_): if isinstance(key, basestring_):
key = self.strings.add(key) key = self.strings.add(key)
if key not in self.key2row: if key not in self.key2row:
@ -110,24 +137,36 @@ cdef class Vectors:
return i return i
def items(self): def items(self):
'''Iterate over (string key, vector) pairs, in order.''' """Iterate over `(string key, vector)` pairs, in order.
YIELDS (tuple): A key/vector pair.
"""
for i, key in enumerate(self.keys): for i, key in enumerate(self.keys):
string = self.strings[key] string = self.strings[key]
yield string, self.data[i] yield string, self.data[i]
@property @property
def shape(self): def shape(self):
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
"""
return self.data.shape return self.data.shape
def most_similar(self, key): def most_similar(self, key):
# TODO: implement
raise NotImplementedError raise NotImplementedError
def from_glove(self, path): def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format, """Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.''' By default GloVe outputs 64-bit vectors.
path (unicode / Path): The path to load the GloVe vectors from.
"""
path = util.ensure_path(path) path = util.ensure_path(path)
for name in path.iterdir(): for name in path.iterdir():
if name.parts[-1].startswith('vectors'): if name.parts[-1].startswith('vectors'):
@ -150,9 +189,15 @@ cdef class Vectors:
self.data self.data
def to_disk(self, path, **exclude): def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
it doesn't exist. Either a string or a Path-like object.
"""
xp = get_array_module(self.data) xp = get_array_module(self.data)
if xp is numpy: if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) save_array = lambda arr, file_: xp.save(file_, arr,
allow_pickle=False)
else: else:
save_array = lambda arr, file_: xp.save(file_, arr) save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict(( serializers = OrderedDict((
@ -162,6 +207,12 @@ cdef class Vectors:
return util.to_disk(path, serializers, exclude) return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude): def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
"""
def load_keys(path): def load_keys(path):
if path.exists(): if path.exists():
self.keys = numpy.load(path2str(path)) self.keys = numpy.load(path2str(path))
@ -182,6 +233,11 @@ cdef class Vectors:
return self return self
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vectors` object.
"""
def serialize_weights(): def serialize_weights():
if hasattr(self.data, 'to_bytes'): if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes() return self.data.to_bytes()
@ -194,6 +250,12 @@ cdef class Vectors:
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude): def from_bytes(self, data, **exclude):
"""Load state from a binary string.
data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vectors): The `Vectors` object.
"""
def deserialize_weights(b): def deserialize_weights(b):
if hasattr(self.data, 'from_bytes'): if hasattr(self.data, 'from_bytes'):
self.data.from_bytes() self.data.from_bytes()

View File

@ -1,32 +1,23 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy import numpy
import dill import dill
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from collections import OrderedDict from collections import OrderedDict
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .tokens.token cimport Token from .tokens.token cimport Token
from .attrs cimport PROB, LANG from .attrs cimport PROB, LANG, ORTH, TAG
from .structs cimport SerializedLexemeC from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle, basestring_ from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .attrs import intify_attrs from .attrs import intify_attrs
from .vectors import Vectors from .vectors import Vectors
from . import util from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models from ._ml import link_vectors_to_models
@ -39,20 +30,19 @@ cdef class Vocab:
strings=tuple(), **deprecated_kwargs): strings=tuple(), **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions lex_attr_getters (dict): A dictionary mapping attribute IDs to
to compute them. Defaults to `None`. functions to compute them. Defaults to `None`.
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes. parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`. lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and strings (StringStore): StringStore that maps strings to integers, and
vice versa. vice versa.
RETURNS (Vocab): The newly constructed vocab object. RETURNS (Vocab): The newly constructed object.
""" """
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {} tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False): if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {}) lemmatizer = Lemmatizer({}, {}, {})
self.mem = Pool() self.mem = Pool()
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
@ -84,19 +74,20 @@ cdef class Vocab:
The flag_getter function will be called over the words currently in the The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id). to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`, See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`. `Token.check_flag`.
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag flag_getter (callable): A function `f(unicode) -> bool`, to get the
value. flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen. available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked. RETURNS (int): The integer ID by which the flag value can be checked.
EXAMPLE: EXAMPLE:
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy']) >>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy']
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
>>> doc = nlp(u'I like spaCy') >>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True >>> assert doc[2].check_flag(MY_PRODUCT) == True
""" """
@ -107,9 +98,10 @@ cdef class Vocab:
break break
else: else:
raise ValueError( raise ValueError(
"Cannot find empty bit for new lexical flag. All bits between " "Cannot find empty bit for new lexical flag. All bits "
"0 and 63 are occupied. You can replace one by specifying the " "between 0 and 63 are occupied. You can replace one by "
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA") "specifying the flag_id explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
elif flag_id >= 64 or flag_id < 1: elif flag_id >= 64 or flag_id < 1:
raise ValueError( raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between " "Invalid value for flag_id: %d. Flag IDs must be between "
@ -120,9 +112,9 @@ cdef class Vocab:
return flag_id return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` """Get a pointer to a `LexemeC` from the lexicon, creating a new
if necessary, using memory acquired from the given pool. If the pool `Lexeme` if necessary using memory acquired from the given pool. If the
is the lexicon's own memory, the lexeme is saved in the lexicon. pool is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if string == u'': if string == u'':
return &EMPTY_LEXEME return &EMPTY_LEXEME
@ -139,9 +131,9 @@ cdef class Vocab:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme` """Get a pointer to a `LexemeC` from the lexicon, creating a new
if necessary, using memory acquired from the given pool. If the pool `Lexeme` if necessary using memory acquired from the given pool. If the
is the lexicon's own memory, the lexeme is saved in the lexicon. pool is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if orth == 0: if orth == 0:
return &EMPTY_LEXEME return &EMPTY_LEXEME
@ -229,13 +221,14 @@ cdef class Vocab:
cdef int i cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings): for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True) props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
token = &tokens[i] token = &tokens[i]
# Set the special tokens up to have arbitrary attributes # Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH]) lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex token.lex = lex
if attrs.TAG in props: if TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG]) self.morphology.assign_tag(token, props[TAG])
for attr_id, value in props.items(): for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value) Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value) Lexeme.set_struct_attr(lex, attr_id, value)
@ -254,16 +247,13 @@ cdef class Vocab:
self.vectors = Vectors(self.strings, width=new_dim) self.vectors = Vectors(self.strings, width=new_dim)
def get_vector(self, orth): def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary. """Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
Words can be looked up by string or int ID. RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
RETURNS: numpy ndarray of shape (300,) and dtype float32.
A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
RAISES: If no vectors data is loaded, ValueError is raised.
""" """
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
@ -273,21 +263,16 @@ cdef class Vocab:
return numpy.zeros((self.vectors_length,), dtype='f') return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector): def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary. """Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
Words can be referenced by string or int ID.
RETURNS:
None
""" """
if not isinstance(orth, basestring_): if not isinstance(orth, basestring_):
orth = self.strings[orth] orth = self.strings[orth]
self.vectors.add(orth, vector=vector) self.vectors.add(orth, vector=vector)
def has_vector(self, orth): def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no """Check whether a word has a vector. Returns False if no vectors have
vectors have been loaded. Words can be looked up by string been loaded. Words can be looked up by string or int ID."""
or int ID."""
if isinstance(orth, basestring_): if isinstance(orth, basestring_):
orth = self.strings.add(orth) orth = self.strings.add(orth)
return orth in self.vectors return orth in self.vectors
@ -296,7 +281,7 @@ cdef class Vocab:
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
@ -421,12 +406,9 @@ def pickle_vocab(vocab):
length = vocab.length length = vocab.length
data_dir = vocab.data_dir data_dir = vocab.data_dir
lex_attr_getters = dill.dumps(vocab.lex_attr_getters) lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes() lexemes_data = vocab.lexemes_to_bytes()
return (unpickle_vocab, return (unpickle_vocab,
(sstore, morph, data_dir, lex_attr_getters, (sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
lexemes_data, length))
def unpickle_vocab(sstore, morphology, data_dir, def unpickle_vocab(sstore, morphology, data_dir,
@ -450,12 +432,10 @@ class LookupError(Exception):
@classmethod @classmethod
def mismatched_strings(cls, id_, id_string, original_string): def mismatched_strings(cls, id_, id_string, original_string):
return cls( return cls(
"Error fetching a Lexeme from the Vocab. When looking up a string, " "Error fetching a Lexeme from the Vocab. When looking up a "
"the lexeme returned had an orth ID that did not match the query string. " "string, the lexeme returned had an orth ID that did not match "
"This means that the cached lexeme structs are mismatched to the " "the query string. This means that the cached lexeme structs are "
"string encoding table. The mismatched:\n" "mismatched to the string encoding table. The mismatched:\n"
"Query string: {query}\n" "Query string: {}\n"
"Orth cached: {orth_str}\n" "Orth cached: {}\n"
"ID of orth: {orth_id}".format( "Orth ID: {}".format(repr(original_string), repr(id_string), id_))
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
)

View File

@ -36,12 +36,14 @@ p
| that maps strings to hash values, and vice versa. | that maps strings to hash values, and vice versa.
+row +row
+cell #[code data] +cell #[code width]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell int
+cell Number of dimensions.
+row +row
+cell #[code width] +cell #[code data]
+cell Number of dimensions. +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector data.
+row("foot") +row("foot")
+cell returns +cell returns
@ -208,7 +210,7 @@ p
+row("foot") +row("foot")
+cell returns +cell returns
+cell tuple +cell tuple
+cell #[code (rows, dims)] pairs. +cell A #[code (rows, dims)] pair.
+h(2, "from_glove") Vectors.from_glove +h(2, "from_glove") Vectors.from_glove
+tag method +tag method
@ -238,11 +240,16 @@ p Save the current state to a directory.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code path] +cell #[code path]
+cell unicode or #[code Path] +cell unicode / #[code Path]
+cell +cell
| A path to a directory, which will be created if it doesn't exist. | A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects. | Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Vectors.from_disk +h(2, "from_disk") Vectors.from_disk
+tag method +tag method
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code path] +cell #[code path]
+cell unicode or #[code Path] +cell unicode / #[code Path]
+cell +cell
| A path to a directory. Paths may be either strings or | A path to a directory. Paths may be either strings or
| #[code Path]-like objects. | #[code Path]-like objects.
@ -297,7 +304,7 @@ p Load state from a binary string.
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code bytes_data] +cell #[code data]
+cell bytes +cell bytes
+cell The data to load from. +cell The data to load from.