Tidy up Vectors and docs

This commit is contained in:
ines 2017-10-27 19:45:19 +02:00
parent 7946464742
commit 5167a0cce2
3 changed files with 151 additions and 102 deletions

View File

@ -1,5 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from libc.stdint cimport int32_t, uint64_t
import numpy
from collections import OrderedDict
import msgpack
@ -9,23 +10,20 @@ cimport numpy as np
from thinc.neural.util import get_array_module
from thinc.neural._classes.model import Model
from .typedefs cimport attr_t
from .strings cimport StringStore
from . import util
from .compat import basestring_, path2str
from . import util
cdef class Vectors:
'''Store, save and load word vectors.
"""Store, save and load word vectors.
Vectors data is kept in the vectors.data attribute, which should be an
instance of numpy.ndarray (for CPU vectors)
or cupy.ndarray (for GPU vectors).
vectors.key2row is a dictionary mapping word hashes to rows
in the vectors.data table. The array `vectors.keys` keeps
the keys in order, such that keys[vectors.key2row[key]] == key.
'''
instance of numpy.ndarray (for CPU vectors) or cupy.ndarray
(for GPU vectors). `vectors.key2row` is a dictionary mapping word hashes to
rows in the vectors.data table. The array `vectors.keys` keeps the keys in
order, such that `keys[vectors.key2row[key]] == key`.
"""
cdef public object data
cdef readonly StringStore strings
cdef public object key2row
@ -33,6 +31,16 @@ cdef class Vectors:
cdef public int i
def __init__(self, strings, width=0, data=None):
"""Create a new vector store. To keep the vector table empty, pass
`width=0`. You can also create the vector table and add vectors one by
one, or set the vector values directly on initialisation.
strings (StringStore or list): List of strings or StringStore that maps
strings to hash values, and vice versa.
width (int): Number of dimensions.
data (numpy.ndarray): The vector data.
RETURNS (Vectors): The newly created object.
"""
if isinstance(strings, StringStore):
self.strings = strings
else:
@ -55,11 +63,13 @@ cdef class Vectors:
return (Vectors, (self.strings, self.data))
def __getitem__(self, key):
'''Get a vector by key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
"""Get a vector by key. If key is a string, it is hashed to an integer
ID using the vectors.strings table. If the integer key is not found in
the table, a KeyError is raised.
If the integer key is not found in the table, a KeyError is raised.
'''
key (unicode / int): The key to get the vector for.
RETURNS (numpy.ndarray): The vector for the key.
"""
if isinstance(key, basestring):
key = self.strings[key]
i = self.key2row[key]
@ -69,30 +79,47 @@ cdef class Vectors:
return self.data[i]
def __setitem__(self, key, vector):
'''Set a vector for the given key. If key is a string, it is hashed
"""Set a vector for the given key. If key is a string, it is hashed
to an integer ID using the vectors.strings table.
'''
key (unicode / int): The key to set the vector for.
vector (numpy.ndarray): The vector to set.
"""
if isinstance(key, basestring):
key = self.strings.add(key)
i = self.key2row[key]
self.data[i] = vector
def __iter__(self):
'''Yield vectors from the table.'''
"""Yield vectors from the table.
YIELDS (numpy.ndarray): A vector.
"""
yield from self.data
def __len__(self):
'''Return the number of vectors that have been assigned.'''
"""Return the number of vectors that have been assigned.
RETURNS (int): The number of vectors in the data.
"""
return self.i
def __contains__(self, key):
'''Check whether a key has a vector entry in the table.'''
"""Check whether a key has a vector entry in the table.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key has a vector entry.
"""
if isinstance(key, basestring_):
key = self.strings[key]
return key in self.key2row
def add(self, key, vector=None):
'''Add a key to the table, optionally setting a vector value as well.'''
"""Add a key to the table, optionally setting a vector value as well.
key (unicode / int): The key to add.
vector (numpy.ndarray): An optional vector to add.
"""
if isinstance(key, basestring_):
key = self.strings.add(key)
if key not in self.key2row:
@ -110,24 +137,36 @@ cdef class Vectors:
return i
def items(self):
'''Iterate over (string key, vector) pairs, in order.'''
"""Iterate over `(string key, vector)` pairs, in order.
YIELDS (tuple): A key/vector pair.
"""
for i, key in enumerate(self.keys):
string = self.strings[key]
yield string, self.data[i]
@property
def shape(self):
"""Get `(rows, dims)` tuples of number of rows and number of dimensions
in the vector table.
RETURNS (tuple): A `(rows, dims)` pair.
"""
return self.data.shape
def most_similar(self, key):
# TODO: implement
raise NotImplementedError
def from_glove(self, path):
'''Load GloVe vectors from a directory. Assumes binary format,
"""Load GloVe vectors from a directory. Assumes binary format,
that the vocab is in a vocab.txt, and that vectors are named
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
By default GloVe outputs 64-bit vectors.'''
By default GloVe outputs 64-bit vectors.
path (unicode / Path): The path to load the GloVe vectors from.
"""
path = util.ensure_path(path)
for name in path.iterdir():
if name.parts[-1].startswith('vectors'):
@ -150,9 +189,15 @@ cdef class Vectors:
self.data
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if
        it doesn't exist. Either a string or a Path-like object.
"""
xp = get_array_module(self.data)
if xp is numpy:
save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False)
save_array = lambda arr, file_: xp.save(file_, arr,
allow_pickle=False)
else:
save_array = lambda arr, file_: xp.save(file_, arr)
serializers = OrderedDict((
@ -162,6 +207,12 @@ cdef class Vectors:
return util.to_disk(path, serializers, exclude)
def from_disk(self, path, **exclude):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
"""
def load_keys(path):
if path.exists():
self.keys = numpy.load(path2str(path))
@ -182,6 +233,11 @@ cdef class Vectors:
return self
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vectors` object.
"""
def serialize_weights():
if hasattr(self.data, 'to_bytes'):
return self.data.to_bytes()
@ -194,6 +250,12 @@ cdef class Vectors:
return util.to_bytes(serializers, exclude)
def from_bytes(self, data, **exclude):
"""Load state from a binary string.
data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vectors): The `Vectors` object.
"""
def deserialize_weights(b):
if hasattr(self.data, 'from_bytes'):
self.data.from_bytes()

View File

@ -1,32 +1,23 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import ujson
import re
import numpy
import dill
from libc.string cimport memset, memcpy
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from collections import OrderedDict
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t
from .tokens.token cimport Token
from .attrs cimport PROB, LANG
from .attrs cimport PROB, LANG, ORTH, TAG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, pickle, basestring_
from .compat import copy_reg, basestring_
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
from . import util
from . import attrs
from . import symbols
from ._ml import link_vectors_to_models
@ -36,23 +27,22 @@ cdef class Vocab:
C-data that is shared between `Doc` objects.
"""
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs):
strings=tuple(), **deprecated_kwargs):
"""Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
to compute them. Defaults to `None`.
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
lex_attr_getters (dict): A dictionary mapping attribute IDs to
functions to compute them. Defaults to `None`.
tag_map (dict): Dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
RETURNS (Vocab): The newly constructed object.
"""
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
tag_map = tag_map if tag_map is not None else {}
if lemmatizer in (None, True, False):
lemmatizer = Lemmatizer({}, {}, {})
self.mem = Pool()
self._by_hash = PreshMap()
self._by_orth = PreshMap()
@ -84,19 +74,20 @@ cdef class Vocab:
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id).
to access the flag value on each token using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
value.
flag_getter (callable): A function `f(unicode) -> bool`, to get the
flag value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
EXAMPLE:
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
>>> my_product_getter = lambda text: text in ['spaCy', 'dislaCy']
>>> MY_PRODUCT = nlp.vocab.add_flag(my_product_getter)
>>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True
"""
@ -107,9 +98,10 @@ cdef class Vocab:
break
else:
raise ValueError(
"Cannot find empty bit for new lexical flag. All bits between "
"0 and 63 are occupied. You can replace one by specifying the "
"flag_id explicitly, e.g. nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA")
"Cannot find empty bit for new lexical flag. All bits "
"between 0 and 63 are occupied. You can replace one by "
"specifying the flag_id explicitly, e.g. "
                "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)`.")
elif flag_id >= 64 or flag_id < 1:
raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between "
@ -120,9 +112,9 @@ cdef class Vocab:
return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == u'':
return &EMPTY_LEXEME
@ -139,9 +131,9 @@ cdef class Vocab:
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""Get a pointer to a `LexemeC` from the lexicon, creating a new
`Lexeme` if necessary using memory acquired from the given pool. If the
pool is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0:
return &EMPTY_LEXEME
@ -203,8 +195,8 @@ cdef class Vocab:
for orth, addr in self._by_orth.items():
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
def __getitem__(self, id_or_string):
"""Retrieve a lexeme, given an int ID or a unicode string. If a
previously unseen unicode string is given, a new lexeme is created and
stored.
@ -229,13 +221,14 @@ cdef class Vocab:
cdef int i
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
props = intify_attrs(props, strings_map=self.strings, _do_deprecated=True)
props = intify_attrs(props, strings_map=self.strings,
_do_deprecated=True)
token = &tokens[i]
# Set the special tokens up to have arbitrary attributes
lex = <LexemeC*>self.get_by_orth(self.mem, props[attrs.ORTH])
lex = <LexemeC*>self.get_by_orth(self.mem, props[ORTH])
token.lex = lex
if attrs.TAG in props:
self.morphology.assign_tag(token, props[attrs.TAG])
if TAG in props:
self.morphology.assign_tag(token, props[TAG])
for attr_id, value in props.items():
Token.set_struct_attr(token, attr_id, value)
Lexeme.set_struct_attr(lex, attr_id, value)
@ -254,16 +247,13 @@ cdef class Vocab:
self.vectors = Vectors(self.strings, width=new_dim)
def get_vector(self, orth):
"""Retrieve a vector for a word in the vocabulary.
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
Words can be looked up by string or int ID.
RETURNS:
A word vector. Size and shape determined by the
vocab.vectors instance. Usually, a numpy ndarray
of shape (300,) and dtype float32.
RAISES: If no vectors data is loaded, ValueError is raised.
RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
"""
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
@ -273,21 +263,16 @@ cdef class Vocab:
return numpy.zeros((self.vectors_length,), dtype='f')
def set_vector(self, orth, vector):
"""Set a vector for a word in the vocabulary.
Words can be referenced by string or int ID.
RETURNS:
None
"""Set a vector for a word in the vocabulary. Words can be referenced
by string or int ID.
"""
if not isinstance(orth, basestring_):
orth = self.strings[orth]
self.vectors.add(orth, vector=vector)
def has_vector(self, orth):
"""Check whether a word has a vector. Returns False if no
vectors have been loaded. Words can be looked up by string
or int ID."""
"""Check whether a word has a vector. Returns False if no vectors have
been loaded. Words can be looked up by string or int ID."""
if isinstance(orth, basestring_):
orth = self.strings.add(orth)
return orth in self.vectors
@ -296,7 +281,7 @@ cdef class Vocab:
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
if not path.exists():
@ -421,16 +406,13 @@ def pickle_vocab(vocab):
length = vocab.length
data_dir = vocab.data_dir
lex_attr_getters = dill.dumps(vocab.lex_attr_getters)
lexemes_data = vocab.lexemes_to_bytes()
return (unpickle_vocab,
(sstore, morph, data_dir, lex_attr_getters,
lexemes_data, length))
(sstore, morph, data_dir, lex_attr_getters, lexemes_data, length))
def unpickle_vocab(sstore, morphology, data_dir,
lex_attr_getters, bytes lexemes_data, int length):
lex_attr_getters, bytes lexemes_data, int length):
cdef Vocab vocab = Vocab()
vocab.length = length
vocab.strings = sstore
@ -450,12 +432,10 @@ class LookupError(Exception):
@classmethod
def mismatched_strings(cls, id_, id_string, original_string):
return cls(
"Error fetching a Lexeme from the Vocab. When looking up a string, "
"the lexeme returned had an orth ID that did not match the query string. "
"This means that the cached lexeme structs are mismatched to the "
"string encoding table. The mismatched:\n"
"Query string: {query}\n"
"Orth cached: {orth_str}\n"
"ID of orth: {orth_id}".format(
query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
)
"Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
            "mismatched to the string encoding table. The mismatch:\n"
"Query string: {}\n"
"Orth cached: {}\n"
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))

View File

@ -36,12 +36,14 @@ p
| that maps strings to hash values, and vice versa.
+row
+cell #[code data]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell #[code width]
+cell int
+cell Number of dimensions.
+row
+cell #[code width]
+cell Number of dimensions.
+cell #[code data]
+cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+cell The vector data.
+row("foot")
+cell returns
@ -208,7 +210,7 @@ p
+row("foot")
+cell returns
+cell tuple
+cell #[code (rows, dims)] pairs.
+cell A #[code (rows, dims)] pair.
+h(2, "from_glove") Vectors.from_glove
+tag method
@ -238,11 +240,16 @@ p Save the current state to a directory.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell unicode / #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Vectors.from_disk
+tag method
@ -255,7 +262,7 @@ p Loads state from a directory. Modifies the object in place and returns it.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell unicode / #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
@ -297,7 +304,7 @@ p Load state from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell #[code data]
+cell bytes
+cell The data to load from.