diff --git a/setup.py b/setup.py index 0a3384ed5..6a22f4076 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,9 @@ MOD_NAMES = [ 'spacy.pipeline', 'spacy.syntax.stateclass', 'spacy.syntax._state', + 'spacy.syntax._beam_utils', 'spacy.tokenizer', + 'spacy._cfile', 'spacy.syntax.parser', 'spacy.syntax.nn_parser', 'spacy.syntax.beam_parser', diff --git a/spacy/_cfile.pxd b/spacy/_cfile.pxd new file mode 100644 index 000000000..cb0077587 --- /dev/null +++ b/spacy/_cfile.pxd @@ -0,0 +1,26 @@ +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from cymem.cymem cimport Pool + +cdef class CFile: + cdef FILE* fp + cdef bint is_open + cdef Pool mem + cdef int size # For compatibility with subclass + cdef int _capacity # For compatibility with subclass + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * + + + +cdef class StringCFile(CFile): + cdef unsigned char* data + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/spacy/_cfile.pyx b/spacy/_cfile.pyx new file mode 100644 index 000000000..ceebe2e59 --- /dev/null +++ b/spacy/_cfile.pyx @@ -0,0 +1,88 @@ +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from libc.string cimport memcpy + + +cdef class CFile: + def __init__(self, loc, mode, on_open_error=None): + if isinstance(mode, unicode): + mode_str = mode.encode('ascii') + else: + mode_str = mode + if hasattr(loc, 'as_posix'): + loc = loc.as_posix() + self.mem = Pool() + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self.fp = fopen(bytes_loc, mode_str) + if self.fp == NULL: + if on_open_error is not None: + on_open_error() + else: + raise IOError("Could not open binary file %s" % bytes_loc) + self.is_open = True + + def __dealloc__(self): + if self.is_open: + fclose(self.fp) + + def close(self): + fclose(self.fp) + self.is_open = False + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + st = fread(dest, elem_size, number, self.fp) + if st != number: + raise IOError + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + st = fwrite(src, elem_size, number, self.fp) + if st != number: + raise IOError + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) + + +cdef class StringCFile: + def __init__(self, mode, bytes data=b'', on_open_error=None): + self.mem = Pool() + self.is_open = 'w' in mode + self._capacity = max(len(data), 8) + self.size = len(data) + self.data = self.mem.alloc(1, self._capacity) + for i in range(len(data)): + self.data[i] = data[i] + + def close(self): + self.is_open = False + + def string_data(self): + return (self.data-self.size)[:self.size] + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + memcpy(dest, self.data, elem_size * number) + self.data += elem_size * number + + cdef int write_from(self, void* src, size_t 
elem_size, size_t number) except -1: + write_size = number * elem_size + if (self.size + write_size) >= self._capacity: + self._capacity = (self.size + write_size) * 2 + self.data = self.mem.realloc(self.data, self._capacity) + memcpy(&self.data[self.size], src, elem_size * number) + self.size += write_size + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) diff --git a/spacy/_ml.py b/spacy/_ml.py index f1ded666e..b3b0d3e46 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -5,10 +5,12 @@ from thinc.neural._classes.hash_embed import HashEmbed from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module import random +import cytoolz from thinc.neural._classes.convolution import ExtractWindow from thinc.neural._classes.static_vectors import StaticVectors -from thinc.neural._classes.batchnorm import BatchNorm +from thinc.neural._classes.batchnorm import BatchNorm as BN +from thinc.neural._classes.layernorm import LayerNorm as LN from thinc.neural._classes.resnet import Residual from thinc.neural import ReLu from thinc.neural._classes.selu import SELU @@ -19,10 +21,12 @@ from thinc.api import FeatureExtracter, with_getitem from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool from thinc.neural._classes.attention import ParametricAttention from thinc.linear.linear import LinearModel -from thinc.api import uniqued, wrap +from thinc.api import uniqued, wrap, flatten_add_lengths + from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP from .tokens.doc import Doc +from . import util import numpy import io @@ -53,6 +57,27 @@ def _logistic(X, drop=0.): return Y, logistic_bwd +@layerize +def add_tuples(X, drop=0.): + """Give inputs of sequence pairs, where each sequence is (vals, length), + sum the values, returning a single sequence. + + If input is: + ((vals1, length), (vals2, length) + Output is: + (vals1+vals2, length) + + vals are a single tensor for the whole batch. + """ + (vals1, length1), (vals2, length2) = X + assert length1 == length2 + + def add_tuples_bwd(dY, sgd=None): + return (dY, dY) + + return (vals1+vals2, length), add_tuples_bwd + + def _zero_init(model): def _zero_init_impl(self, X, y): self.W.fill(0) @@ -61,6 +86,7 @@ def _zero_init(model): model.W.fill(0.) 
return model + @layerize def _preprocess_doc(docs, drop=0.): keys = [doc.to_array([LOWER]) for doc in docs] @@ -72,7 +98,6 @@ def _preprocess_doc(docs, drop=0.): return (keys, vals, lengths), None - def _init_for_precomputed(W, ops): if (W**2).sum() != 0.: return @@ -80,6 +105,7 @@ def _init_for_precomputed(W, ops): ops.xavier_uniform_init(reshaped) W[:] = reshaped.reshape(W.shape) + @describe.on_data(_set_dimensions_if_needed) @describe.attributes( nI=Dimension("Input size"), @@ -184,25 +210,36 @@ class PrecomputableMaxouts(Model): return Yfp, backward +def drop_layer(layer, factor=2.): + def drop_layer_fwd(X, drop=0.): + drop *= factor + mask = layer.ops.get_dropout_mask((1,), drop) + if mask is None or mask > 0: + return layer.begin_update(X, drop=drop) + else: + return X, lambda dX, sgd=None: dX + return wrap(drop_layer_fwd, layer) + + def Tok2Vec(width, embed_size, preprocess=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): - norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') + norm = get_col(cols.index(NORM)) >> HashEmbed(width, embed_size, name='embed_lower') prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size//2, name='embed_shape') - embed = (norm | prefix | suffix | shape ) + embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) tok2vec = ( with_flatten( asarray(Model.ops, dtype='uint64') - >> embed - >> Maxout(width, width*4, pieces=3) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) - >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), - pad=4) + >> uniqued(embed, column=5) + >> drop_layer( + Residual( + (ExtractWindow(nW=1) >> BN(Maxout(width, width*3))) + ) + ) ** 4, pad=4 + ) ) if preprocess not in (False, None): tok2vec = preprocess >> tok2vec @@ -297,7 +334,8 @@ def zero_init(model): def doc2feats(cols=None): - cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] + if cols is None: + cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] def forward(docs, drop=0.): feats = [] for doc in docs: @@ -323,6 +361,37 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): return vectors, backward +def fine_tune(embedding, combine=None): + if combine is not None: + raise NotImplementedError( + "fine_tune currently only supports addition. 
Set combine=None") + def fine_tune_fwd(docs_tokvecs, drop=0.): + docs, tokvecs = docs_tokvecs + lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') + + vecs, bp_vecs = embedding.begin_update(docs, drop=drop) + flat_tokvecs = embedding.ops.flatten(tokvecs) + flat_vecs = embedding.ops.flatten(vecs) + output = embedding.ops.unflatten( + (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), + lengths) + + def fine_tune_bwd(d_output, sgd=None): + bp_vecs(d_output, sgd=sgd) + flat_grad = model.ops.flatten(d_output) + model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() + model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() + if sgd is not None: + sgd(model._mem.weights, model._mem.gradient, key=model.id) + return d_output + return output, fine_tune_bwd + model = wrap(fine_tune_fwd, embedding) + model.mix = model._mem.add((model.id, 'mix'), (2,)) + model.mix.fill(1.) + model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) + return model + + @layerize def flatten(seqs, drop=0.): if isinstance(seqs[0], numpy.ndarray): @@ -369,6 +438,27 @@ def preprocess_doc(docs, drop=0.): vals = ops.allocate(keys.shape[0]) + 1 return (keys, vals, lengths), None +def getitem(i): + def getitem_fwd(X, drop=0.): + return X[i], None + return layerize(getitem_fwd) + +def build_tagger_model(nr_class, token_vector_width, **cfg): + embed_size = util.env_opt('embed_size', 7500) + with Model.define_operators({'>>': chain, '+': add}): + # Input: (doc, tensor) tuples + private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) + + model = ( + fine_tune(private_tok2vec) + >> with_flatten( + Maxout(token_vector_width, token_vector_width) + >> Softmax(nr_class, token_vector_width) + ) + ) + model.nI = None + return model + def build_text_classifier(nr_class, width=64, **cfg): nr_vector = cfg.get('nr_vector', 200) @@ -383,7 +473,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> _flatten_add_lengths >> with_getitem(0, uniqued( - (embed_lower | embed_prefix | embed_suffix | embed_shape) + (embed_lower | embed_prefix | embed_suffix | embed_shape) >> Maxout(width, width+(width//2)*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) @@ -404,7 +494,7 @@ def build_text_classifier(nr_class, width=64, **cfg): >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) >> logistic ) - + model.lsuv = False return model diff --git a/spacy/about.py b/spacy/about.py index 9f62c769e..d494f8d31 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a9' +__version__ = '2.0.0a10' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index a0a76e5ec..fef6753e6 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -21,10 +21,10 @@ CONVERTERS = { @plac.annotations( input_file=("input file", "positional", None, str), output_dir=("output directory for converted file", "positional", None, str), - n_sents=("Number of sentences per doc", "option", "n", float), + n_sents=("Number of sentences per doc", "option", "n", int), morphology=("Enable appending morphology to tags", "flag", "m", bool) ) -def convert(cmd, input_file, output_dir, n_sents, morphology): +def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): 
""" Convert files into JSON format for use with train command and other experiment management functions. diff --git a/spacy/cli/train.py b/spacy/cli/train.py index af028dae5..04aac8319 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, for batch in minibatch(train_docs, size=batch_sizes): docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, - drop=next(dropout_rates), losses=losses) + drop=next(dropout_rates), losses=losses, + update_tensors=True) pbar.update(sum(len(doc) for doc in docs)) with nlp.use_params(optimizer.averages): util.set_env_log(False) epoch_model_path = output_path / ('model%d' % i) nlp.to_disk(epoch_model_path) - with (output_path / ('model%d.pickle' % i)).open('wb') as file_: - dill.dump(nlp, file_, -1) nlp_loaded = lang_class(pipeline=pipeline) nlp_loaded = nlp_loaded.from_disk(epoch_model_path) scorer = nlp_loaded.evaluate( diff --git a/spacy/compat.py b/spacy/compat.py index 4ef24cd8b..e6b7c066b 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -46,19 +46,21 @@ is_osx = sys.platform == 'darwin' if is_python2: + import imp bytes_ = str unicode_ = unicode basestring_ = basestring input_ = raw_input - json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8') + json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8') path2str = lambda path: str(path).decode('utf8') elif is_python3: + import importlib.util bytes_ = bytes unicode_ = str basestring_ = str input_ = input - json_dumps = lambda data: ujson.dumps(data, indent=2) + json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False) path2str = lambda path: str(path) @@ -102,3 +104,12 @@ def normalize_string_keys(old): return new +def import_file(name, loc): + loc = str(loc) + if is_python2: + return imp.load_source(name, loc) + else: + spec = importlib.util.spec_from_file_location(name, str(loc)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py new file mode 100644 index 000000000..549f71fb5 --- /dev/null +++ b/spacy/lang/da/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.da.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple overvejer at købe et britisk statup for 1 milliard dollar", + "Selvkørende biler flytter forsikringsansvaret over på producenterne", + "San Francisco overvejer at forbyde leverandørrobotter på fortov", + "London er en stor by i Storbritannien" +] diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py new file mode 100644 index 000000000..49ac0e14b --- /dev/null +++ b/spacy/lang/de/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.de.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", + "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", + "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz", + "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion", + "San Francisco erwägt Verbot von Lieferrobotern", + "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller", + "Wo bist du?", + "Was ist die Hauptstadt von Deutschland?" +] diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py new file mode 100644 index 000000000..b92d4a65c --- /dev/null +++ b/spacy/lang/en/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.en.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple is looking at buying U.K. startup for $1 billion", + "Autonomous cars shift insurance liability toward manufacturers", + "San Francisco considers banning sidewalk delivery robots", + "London is a big city in the United Kingdom.", + "Where are you?", + "Who is the president of France?", + "What is the capital of the United States?", + "When was Barack Obama born?" +] diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py new file mode 100644 index 000000000..61fe8c9be --- /dev/null +++ b/spacy/lang/es/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.es.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares", + "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes", + "San Francisco analiza prohibir los robots delivery", + "Londres es una gran ciudad del Reino Unido", + "El gato come pescado", + "Veo al hombre con el telescopio", + "La araña come moscas", + "El pingüino incuba en su nido" +] diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py new file mode 100644 index 000000000..08409ea61 --- /dev/null +++ b/spacy/lang/fr/examples.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.fr.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple cherche a acheter une startup anglaise pour 1 milliard de dollard", + "Les voitures autonomes voient leur assurances décalées vers les constructeurs", + "San Francisco envisage d'interdire les robots coursiers", + "Londres est une grande ville du Royaume-Uni", + "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe", + "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon", + "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule", + "Nouvelles attaques de Trump contre le maire de Londres", + "Où es-tu ?", + "Qui est le président de la France ?", + "Où est la capitale des Etats-Unis ?", + "Quand est né Barack Obama ?" 
+] diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py new file mode 100644 index 000000000..f99f4814b --- /dev/null +++ b/spacy/lang/he/examples.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.he.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + 'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל', + 'רה"מ הודיע כי יחרים טקס בחסותו', + 'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100', + 'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית', + 'סע לשלום, המפתחות בפנים.', + 'מלצר, פעמיים טורקי!', + 'ואהבת לרעך כמוך.', + 'היום נעשה משהו בלתי נשכח.', + 'איפה הילד?', + 'מיהו נשיא צרפת?', + 'מהי בירת ארצות הברית?', + "איך קוראים בעברית לצ'ופצ'יק של הקומקום?", + 'מה הייתה הדקה?', + 'מי אומר שלום ראשון, זה שעולה או זה שיורד?' +] diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py new file mode 100644 index 000000000..d35b9f834 --- /dev/null +++ b/spacy/lang/it/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.it.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", + "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", + "San Francisco prevede di bandire i robot di consegna porta a porta", + "Londra è una grande città del Regno Unito." +] diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py new file mode 100644 index 000000000..0dc5c8144 --- /dev/null +++ b/spacy/lang/nb/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.nb.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar", + "Selvkjørende biler flytter forsikringsansvaret over på produsentene ", + "San Francisco vurderer å forby robotbud på fortauene", + "London er en stor by i Storbritannia." +] diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py new file mode 100644 index 000000000..af6c72af0 --- /dev/null +++ b/spacy/lang/pl/examples.py @@ -0,0 +1,20 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.pl.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Poczuł przyjemną woń mocnej kawy.", + "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", + "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.", + "Nowy abonament pod lupą Komisji Europejskiej", + "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", + "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”." +] diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py new file mode 100644 index 000000000..239929215 --- /dev/null +++ b/spacy/lang/pt/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.pt.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", + "Carros autônomos empurram a responsabilidade do seguro para os fabricantes." + "São Francisco considera banir os robôs de entrega que andam pelas calçadas", + "Londres é a maior cidade do Reino Unido" +] diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py new file mode 100644 index 000000000..be279c4bd --- /dev/null +++ b/spacy/lang/sv/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.sv.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple överväger att köpa brittisk startup för 1 miljard dollar.", + "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.", + "San Fransisco överväger förbud mot leveransrobotar på trottoarer.". + "London är en storstad i Storbritannien." +] diff --git a/spacy/language.py b/spacy/language.py index 0284c4636..ed880d9ca 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -95,7 +95,7 @@ class BaseDefaults(object): meta = nlp.meta if nlp is not None else {} # Resolve strings, like "cnn", "lstm", etc pipeline = [] - for entry in cls.pipeline: + for entry in meta.get('pipeline', []): if entry in disable or getattr(entry, 'name', entry) in disable: continue factory = cls.Defaults.factories[entry] @@ -277,7 +277,8 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0., sgd=None, losses=None, + update_tensors=False): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -304,14 +305,17 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline[1:]) random.shuffle(pipes) + tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) + all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] for proc in pipes: if not hasattr(proc, 'update'): continue - tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) - if d_tokvecses is not None: - bp_tokvecses(d_tokvecses, sgd=sgd) + if update_tensors and d_tokvecses is not None: + for i, d_tv in enumerate(d_tokvecses): + all_d_tokvecses[i] += d_tv + bp_tokvecses(all_d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. @@ -381,9 +385,18 @@ class Language(object): return optimizer def evaluate(self, docs_golds): - docs, golds = zip(*docs_golds) scorer = Scorer() - for doc, gold in zip(self.pipe(docs, batch_size=32), golds): + docs, golds = zip(*docs_golds) + docs = list(docs) + golds = list(golds) + for pipe in self.pipeline: + if not hasattr(pipe, 'pipe'): + for doc in docs: + pipe(doc) + else: + docs = list(pipe.pipe(docs)) + assert len(docs) == len(golds) + for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None return scorer @@ -417,11 +430,16 @@ class Language(object): except StopIteration: pass - def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]): + def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, + disable=[]): """Process texts as a stream, and yield `Doc` objects in order. 
Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process. + as_tuples (bool): + If set to True, inputs should be a sequence of + (text, context) tuples. Output will then be a sequence of + (doc, context) tuples. Defaults to False. n_threads (int): The number of worker threads to use. If -1, OpenMP will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. @@ -433,7 +451,7 @@ class Language(object): >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4): >>> assert doc.is_parsed """ - if tuples: + if as_tuples: text_context1, text_context2 = itertools.tee(texts) texts = (tc[0] for tc in text_context1) contexts = (tc[1] for tc in text_context2) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 947f0a1f1..634d3e4b5 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -42,7 +42,7 @@ from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats -from ._ml import build_text_classifier +from ._ml import build_text_classifier, build_tagger_model from .parts_of_speech import X @@ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent): name = 'tensorizer' @classmethod - def Model(cls, width=128, embed_size=7500, **cfg): + def Model(cls, width=128, embed_size=4000, **cfg): """Create a new statistical model for the class. width (int): Output size of the model. @@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent): self.cfg = dict(cfg) def __call__(self, doc): - tags = self.predict([doc.tensor]) + tags = self.predict(([doc], [doc.tensor])) self.set_annotations([doc], tags) return doc def pipe(self, stream, batch_size=128, n_threads=-1): for docs in cytoolz.partition_all(batch_size, stream): + docs = list(docs) tokvecs = [d.tensor for d in docs] - tag_ids = self.predict(tokvecs) + tag_ids = self.predict((docs, tokvecs)) self.set_annotations(docs, tag_ids) yield from docs - def predict(self, tokvecs): - scores = self.model(tokvecs) + def predict(self, docs_tokvecs): + scores = self.model(docs_tokvecs) scores = self.model.ops.flatten(scores) guesses = scores.argmax(axis=1) if not isinstance(guesses, numpy.ndarray): guesses = guesses.get() + tokvecs = docs_tokvecs[1] guesses = self.model.ops.unflatten(guesses, [tv.shape[0] for tv in tokvecs]) return guesses @@ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent): cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[i] + if hasattr(doc_tag_ids, 'get'): + doc_tag_ids = doc_tag_ids.get() for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0 and doc.c[j].pos == 0: @@ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent): if self.model.nI is None: self.model.nI = tokvecs[0].shape[1] - - tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) + tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) @@ -346,10 +349,8 @@ class NeuralTagger(BaseThincComponent): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def use_params(self, params): with self.model.use_params(params): yield @@ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger): @property def 
labels(self): - return self.cfg.get('labels', {}) + return self.cfg.setdefault('labels', {}) @labels.setter def labels(self, value): @@ -455,10 +456,8 @@ class NeuralLabeller(NeuralTagger): @classmethod def Model(cls, n_tags, token_vector_width): - return with_flatten( - chain(Maxout(token_vector_width, token_vector_width), - Softmax(n_tags, token_vector_width))) - + return build_tagger_model(n_tags, token_vector_width) + def get_loss(self, docs, golds, scores): scores = self.model.ops.flatten(scores) cdef int idx = 0 diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 2e42b9667..6f676c79a 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -215,7 +215,10 @@ cdef class StringStore: path = util.ensure_path(path) with path.open('r') as file_: strings = ujson.load(file_) + prev = list(self) self._reset_and_load(strings) + for word in prev: + self.add(word) return self def to_bytes(self, **exclude): @@ -234,7 +237,10 @@ cdef class StringStore: RETURNS (StringStore): The `StringStore` object. """ strings = ujson.loads(bytes_data) + prev = list(self) self._reset_and_load(strings) + for word in prev: + self.add(word) return self def set_frozen(self, bint is_frozen): diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx new file mode 100644 index 000000000..4d90fe23b --- /dev/null +++ b/spacy/syntax/_beam_utils.pyx @@ -0,0 +1,286 @@ +# cython: infer_types=True +# cython: profile=True +cimport numpy as np +import numpy +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF +from thinc.extra.search cimport Beam +from thinc.extra.search import MaxViolation +from thinc.typedefs cimport hash_t, class_t +from thinc.extra.search cimport MaxViolation + +from .transition_system cimport TransitionSystem, Transition +from .stateclass cimport StateClass +from ..gold cimport GoldParse +from ..tokens.doc cimport Doc + + +# These are passed as callbacks to thinc.search.Beam +cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: + dest = _dest + src = _src + moves = _moves + dest.clone(src) + moves[clas].do(dest.c, moves[clas].label) + + +cdef int _check_final_state(void* _state, void* extra_args) except -1: + return (_state).is_final() + + +def _cleanup(Beam beam): + for i in range(beam.width): + Py_XDECREF(beam._states[i].content) + Py_XDECREF(beam._parents[i].content) + + +cdef hash_t _hash_state(void* _state, void* _) except 0: + state = _state + if state.c.is_final(): + return 1 + else: + return state.c.hash() + + +cdef class ParserBeam(object): + cdef public TransitionSystem moves + cdef public object states + cdef public object golds + cdef public object beams + cdef public object dones + + def __init__(self, TransitionSystem moves, states, golds, + int width, float density): + self.moves = moves + self.states = states + self.golds = golds + self.beams = [] + cdef Beam beam + cdef StateClass state, st + for state in states: + beam = Beam(self.moves.n_moves, width, density) + beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) + for i in range(beam.width): + st = beam.at(i) + st.c.offset = state.c.offset + self.beams.append(beam) + self.dones = [False] * len(self.beams) + + def __dealloc__(self): + if self.beams is not None: + for beam in self.beams: + if beam is not None: + _cleanup(beam) + + @property + def is_done(self): + return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams)) + + def __getitem__(self, i): + return self.beams[i] + + def __len__(self): + return len(self.beams) + + def 
advance(self, scores, follow_gold=False): + cdef Beam beam + for i, beam in enumerate(self.beams): + if beam.is_done or not scores[i].size or self.dones[i]: + continue + self._set_scores(beam, scores[i]) + if self.golds is not None: + self._set_costs(beam, self.golds[i], follow_gold=follow_gold) + if follow_gold: + beam.advance(_transition_state, NULL, self.moves.c) + else: + beam.advance(_transition_state, _hash_state, self.moves.c) + beam.check_done(_check_final_state, NULL) + if beam.is_done and self.golds is not None: + for j in range(beam.size): + state = beam.at(j) + if state.is_final(): + try: + if self.moves.is_gold_parse(state, self.golds[i]): + beam._states[j].loss = 0.0 + elif beam._states[j].loss == 0.0: + beam._states[j].loss = 1.0 + except NotImplementedError: + break + + def _set_scores(self, Beam beam, float[:, ::1] scores): + cdef float* c_scores = &scores[0, 0] + cdef int nr_state = min(scores.shape[0], beam.size) + cdef int nr_class = scores.shape[1] + for i in range(nr_state): + state = beam.at(i) + if not state.is_final(): + for j in range(nr_class): + beam.scores[i][j] = c_scores[i * nr_class + j] + self.moves.set_valid(beam.is_valid[i], state.c) + else: + for j in range(beam.nr_class): + beam.scores[i][j] = 0 + beam.costs[i][j] = 0 + + def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): + for i in range(beam.size): + state = beam.at(i) + if not state.c.is_final(): + self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) + if follow_gold: + for j in range(beam.nr_class): + if beam.costs[i][j] >= 1: + beam.is_valid[i][j] = 0 + + +def get_token_ids(states, int n_tokens): + cdef StateClass state + cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), + dtype='int32', order='C') + c_ids = ids.data + for i, state in enumerate(states): + if not state.is_final(): + state.c.set_context_tokens(c_ids, n_tokens) + else: + ids[i] = -1 + c_ids += ids.shape[1] + return ids + +nr_update = 0 +def update_beam(TransitionSystem moves, int nr_feature, int max_steps, + states, tokvecs, golds, + state2vec, vec2scores, + int width, float density, + sgd=None, losses=None, drop=0.): + global nr_update + cdef MaxViolation violn + nr_update += 1 + pbeam = ParserBeam(moves, states, golds, + width=width, density=density) + gbeam = ParserBeam(moves, states, golds, + width=width, density=0.0) + cdef StateClass state + beam_maps = [] + backprops = [] + violns = [MaxViolation() for _ in range(len(states))] + for t in range(max_steps): + if pbeam.is_done and gbeam.is_done: + break + # The beam maps let us find the right row in the flattened scores + # arrays for each state. States are identified by (example id, history). + # We keep a different beam map for each step (since we'll have a flat + # scores array for each step). The beam map will let us take the per-state + # losses, and compute the gradient for each (step, state, class). + beam_maps.append({}) + # Gather all states from the two beams in a list. Some stats may occur + # in both beams. 
To figure out which beam each state belonged to, + # we keep two lists of indices, p_indices and g_indices + states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) + if not states: + break + # Now that we have our flat list of states, feed them through the model + token_ids = get_token_ids(states, nr_feature) + vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) + scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) + + # Store the callbacks for the backward pass + backprops.append((token_ids, bp_vectors, bp_scores)) + + # Unpack the flat scores into lists for the two beams. The indices arrays + # tell us which example and state the scores-row refers to. + p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] + g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in g_indices] + # Now advance the states in the beams. The gold beam is contrained to + # to follow only gold analyses. + pbeam.advance(p_scores) + gbeam.advance(g_scores, follow_gold=True) + # Track the "maximum violation", to use in the update. + for i, violn in enumerate(violns): + violn.check_crf(pbeam[i], gbeam[i]) + histories = [] + losses = [] + for violn in violns: + if violn.p_hist: + histories.append(violn.p_hist + violn.g_hist) + losses.append(violn.p_probs + violn.g_probs) + else: + histories.append([]) + losses.append([]) + states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) + return states_d_scores, backprops[:len(states_d_scores)] + + +def get_states(pbeams, gbeams, beam_map, nr_update): + seen = {} + states = [] + p_indices = [] + g_indices = [] + cdef Beam pbeam, gbeam + assert len(pbeams) == len(gbeams) + for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): + p_indices.append([]) + g_indices.append([]) + for i in range(pbeam.size): + state = pbeam.at(i) + if not state.is_final(): + key = tuple([eg_id] + pbeam.histories[i]) + assert key not in seen, (key, seen) + seen[key] = len(states) + p_indices[-1].append(len(states)) + states.append(state) + beam_map.update(seen) + for i in range(gbeam.size): + state = gbeam.at(i) + if not state.is_final(): + key = tuple([eg_id] + gbeam.histories[i]) + if key in seen: + g_indices[-1].append(seen[key]) + else: + g_indices[-1].append(len(states)) + beam_map[key] = len(states) + states.append(state) + p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices] + g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices] + return states, p_idx, g_idx + + +def get_gradient(nr_class, beam_maps, histories, losses): + """ + The global model assigns a loss to each parse. The beam scores + are additive, so the same gradient is applied to each action + in the history. This gives the gradient of a single *action* + for a beam state -- so we have "the gradient of loss for taking + action i given history H." 
+ + Histories: Each hitory is a list of actions + Each candidate has a history + Each beam has multiple candidates + Each batch has multiple beams + So history is list of lists of lists of ints + """ + nr_step = len(beam_maps) + grads = [] + nr_step = 0 + for eg_id, hists in enumerate(histories): + for loss, hist in zip(losses[eg_id], hists): + if loss != 0.0 and not numpy.isnan(loss): + nr_step = max(nr_step, len(hist)) + for i in range(nr_step): + grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f')) + assert len(histories) == len(losses) + for eg_id, hists in enumerate(histories): + for loss, hist in zip(losses[eg_id], hists): + if loss == 0.0 or numpy.isnan(loss): + continue + key = tuple([eg_id]) + # Adjust loss for length + avg_loss = loss / len(hist) + loss += avg_loss * (nr_step - len(hist)) + for j, clas in enumerate(hist): + i = beam_maps[j][key] + # In step j, at state i action clas + # resulted in loss + grads[j][i, clas] += loss + key = key + tuple([clas]) + return grads + + diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index c06851978..3da9e5d4c 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -37,6 +37,7 @@ cdef cppclass StateC: this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) this._sent = calloc(length + (PADDING * 2), sizeof(TokenC)) this._ents = calloc(length + (PADDING * 2), sizeof(Entity)) + this.offset = 0 cdef int i for i in range(length + (PADDING * 2)): this._ents[i].end = -1 @@ -73,7 +74,16 @@ cdef cppclass StateC: free(this.shifted - PADDING) void set_context_tokens(int* ids, int n) nogil: - if n == 13: + if n == 8: + ids[0] = this.B(0) + ids[1] = this.B(1) + ids[2] = this.S(0) + ids[3] = this.S(1) + ids[4] = this.H(this.S(0)) + ids[5] = this.L(this.B(0), 1) + ids[6] = this.L(this.S(0), 2) + ids[7] = this.R(this.S(0), 1) + elif n == 13: ids[0] = this.B(0) ids[1] = this.B(1) ids[2] = this.S(0) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 29e8de0aa..aab350d76 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -351,6 +351,20 @@ cdef class ArcEager(TransitionSystem): def __get__(self): return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) + def is_gold_parse(self, StateClass state, GoldParse gold): + predicted = set() + truth = set() + for i in range(gold.length): + if gold.cand_to_gold[i] is None: + continue + if state.safe_get(i).dep: + predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep])) + else: + predicted.add((i, state.H(i), 'ROOT')) + id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] + truth.add((id_, head, dep)) + return truth == predicted + def has_gold(self, GoldParse gold, start=0, end=None): end = end or len(gold.heads) if all([tag is None for tag in gold.heads[start:end]]): @@ -385,6 +399,7 @@ cdef class ArcEager(TransitionSystem): for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] + return Transition(clas=0, move=MISSING, label=0) def move_name(self, int move, attr_t label): label_str = self.strings[label] diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx index e96e28fcf..68e9f27af 100644 --- a/spacy/syntax/beam_parser.pyx +++ b/spacy/syntax/beam_parser.pyx @@ -107,7 +107,7 @@ cdef class BeamParser(Parser): # The non-monotonic oracle makes it difficult to ensure final costs are # correct. 
Therefore do final correction for i in range(pred.size): - if is_gold(pred.at(i), gold_parse, self.moves.strings): + if self.moves.is_gold_parse(pred.at(i), gold_parse): pred._states[i].loss = 0.0 elif pred._states[i].loss == 0.0: pred._states[i].loss = 1.0 @@ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio if not pred._states[i].is_done or pred._states[i].loss == 0: continue state = pred.at(i) - if is_gold(state, gold_parse, moves.strings) == True: + if moves.is_gold_parse(state, gold_parse) == True: for dep in gold_parse.orig_annot: print(dep[1], dep[3], dep[4]) print("Cost", pred._states[i].loss) @@ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio if not gold._states[i].is_done: continue state = gold.at(i) - if is_gold(state, gold_parse, moves.strings) == False: + if moves.is_gold(state, gold_parse) == False: print("Truth") for dep in gold_parse.orig_annot: print(dep[1], dep[3], dep[4]) @@ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio raise Exception("Gold parse is not gold-standard") -def is_gold(StateClass state, GoldParse gold, StringStore strings): - predicted = set() - truth = set() - for i in range(gold.length): - if gold.cand_to_gold[i] is None: - continue - if state.safe_get(i).dep: - predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) - else: - predicted.add((i, state.H(i), 'ROOT')) - id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] - truth.add((id_, head, dep)) - return truth == predicted diff --git a/spacy/syntax/nn_parser.pxd b/spacy/syntax/nn_parser.pxd index 524718965..7ff4b9f9f 100644 --- a/spacy/syntax/nn_parser.pxd +++ b/spacy/syntax/nn_parser.pxd @@ -14,8 +14,4 @@ cdef class Parser: cdef readonly TransitionSystem moves cdef readonly object cfg - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil - #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 0b39e2216..7412ebeee 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -37,14 +37,18 @@ from preshed.maps cimport MapStruct from preshed.maps cimport map_get from thinc.api import layerize, chain, noop, clone -from thinc.neural import Model, Affine, ELU, ReLu, Maxout +from thinc.neural import Model, Affine, ReLu, Maxout +from thinc.neural._classes.batchnorm import BatchNorm as BN +from thinc.neural._classes.selu import SELU +from thinc.neural._classes.layernorm import LayerNorm from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.util import get_array_module from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats, rebatch +from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune +from .._ml import Residual, drop_layer from ..compat import json_dumps from . import _parse_features @@ -59,8 +63,10 @@ from ..structs cimport TokenC from ..tokens.doc cimport Doc from ..strings cimport StringStore from ..gold cimport GoldParse -from ..attrs cimport TAG, DEP +from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX, TAG +from . import _beam_utils +USE_FINE_TUNE = True def get_templates(*args, **kwargs): return [] @@ -232,11 +238,14 @@ cdef class Parser: Base class of the DependencyParser and EntityRecognizer. 
""" @classmethod - def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg): + def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): depth = util.env_opt('parser_hidden_depth', depth) token_vector_width = util.env_opt('token_vector_width', token_vector_width) hidden_width = util.env_opt('hidden_width', hidden_width) parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) + embed_size = util.env_opt('embed_size', 4000) + tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, + preprocess=doc2feats())) if parser_maxout_pieces == 1: lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, nF=cls.nr_feature, @@ -248,15 +257,10 @@ cdef class Parser: nI=token_vector_width) with Model.use_device('cpu'): - if depth == 0: - upper = chain() - upper.is_noop = True - else: - upper = chain( - clone(Maxout(hidden_width), (depth-1)), - zero_init(Affine(nr_class, drop_factor=0.0)) - ) - upper.is_noop = False + upper = chain( + clone(Maxout(hidden_width), (depth-1)), + zero_init(Affine(nr_class, drop_factor=0.0)) + ) # TODO: This is an unfortunate hack atm! # Used to set input dimensions in network. lower.begin_training(lower.ops.allocate((500, token_vector_width))) @@ -268,7 +272,7 @@ cdef class Parser: 'hidden_width': hidden_width, 'maxout_pieces': parser_maxout_pieces } - return (lower, upper), cfg + return (tensors, lower, upper), cfg def __init__(self, Vocab vocab, moves=True, model=True, **cfg): """ @@ -294,6 +298,10 @@ cdef class Parser: self.moves = self.TransitionSystem(self.vocab.strings, {}) else: self.moves = moves + if 'beam_width' not in cfg: + cfg['beam_width'] = util.env_opt('beam_width', 1) + if 'beam_density' not in cfg: + cfg['beam_density'] = util.env_opt('beam_density', 0.0) self.cfg = cfg if 'actions' in self.cfg: for action, labels in self.cfg.get('actions', {}).items(): @@ -316,7 +324,7 @@ cdef class Parser: if beam_width is None: beam_width = self.cfg.get('beam_width', 1) if beam_density is None: - beam_density = self.cfg.get('beam_density', 0.001) + beam_density = self.cfg.get('beam_density', 0.0) cdef Beam beam if beam_width == 1: states = self.parse_batch([doc], [doc.tensor]) @@ -332,7 +340,7 @@ cdef class Parser: return output def pipe(self, docs, int batch_size=1000, int n_threads=2, - beam_width=1, beam_density=0.001): + beam_width=None, beam_density=None): """ Process a stream of documents. @@ -344,17 +352,23 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. 
""" - cdef StateClass parse_state + if beam_width is None: + beam_width = self.cfg.get('beam_width', 1) + if beam_density is None: + beam_density = self.cfg.get('beam_density', 0.0) cdef Doc doc - queue = [] + cdef Beam beam for docs in cytoolz.partition_all(batch_size, docs): docs = list(docs) - tokvecs = [d.tensor for d in docs] + tokvecs = [doc.tensor for doc in docs] if beam_width == 1: parse_states = self.parse_batch(docs, tokvecs) else: - parse_states = self.beam_parse(docs, tokvecs, - beam_width=beam_width, beam_density=beam_density) + beams = self.beam_parse(docs, tokvecs, + beam_width=beam_width, beam_density=beam_density) + parse_states = [] + for beam in beams: + parse_states.append(beam.at(0)) self.set_annotations(docs, parse_states) yield from docs @@ -369,8 +383,12 @@ cdef class Parser: int nr_class, nr_feat, nr_piece, nr_dim, nr_state if isinstance(docs, Doc): docs = [docs] + if isinstance(tokvecses, np.ndarray): + tokvecses = [tokvecses] tokvecs = self.model[0].ops.flatten(tokvecses) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) nr_state = len(docs) nr_class = self.moves.n_moves @@ -394,27 +412,20 @@ cdef class Parser: cdef np.ndarray scores c_token_ids = token_ids.data c_is_valid = is_valid.data - cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) while not next_step.empty(): - if not has_hidden: - for i in cython.parallel.prange( - next_step.size(), num_threads=6, nogil=True): - self._parse_step(next_step[i], - feat_weights, nr_class, nr_feat, nr_piece) - else: - for i in range(next_step.size()): - st = next_step[i] - st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) - self.moves.set_valid(&c_is_valid[i*nr_class], st) - vectors = state2vec(token_ids[:next_step.size()]) - scores = vec2scores(vectors) - c_scores = scores.data - for i in range(next_step.size()): - st = next_step[i] - guess = arg_max_if_valid( - &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) - action = self.moves.c[guess] - action.do(st, action.label) + for i in range(next_step.size()): + st = next_step[i] + st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) + self.moves.set_valid(&c_is_valid[i*nr_class], st) + vectors = state2vec(token_ids[:next_step.size()]) + scores = vec2scores(vectors) + c_scores = scores.data + for i in range(next_step.size()): + st = next_step[i] + guess = arg_max_if_valid( + &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) + action = self.moves.c[guess] + action.do(st, action.label) this_step, next_step = next_step, this_step next_step.clear() for st in this_step: @@ -422,18 +433,22 @@ cdef class Parser: next_step.push_back(st) return states - def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001): + def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): cdef Beam beam cdef np.ndarray scores cdef Doc doc cdef int nr_class = self.moves.n_moves cdef StateClass stcls, output tokvecs = self.model[0].ops.flatten(tokvecses) + if USE_FINE_TUNE: + tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) cuda_stream = get_cuda_stream() state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, cuda_stream, 0.0) beams = [] cdef int offset = 0 + cdef int j = 0 + cdef int k for doc in docs: beam = Beam(nr_class, beam_width, min_density=beam_density) beam.initialize(self.moves.init_beam_state, doc.length, doc.c) @@ -446,44 +461,32 @@ cdef class Parser: states = [] for i in range(beam.size): stcls = beam.at(i) - states.append(stcls) + 
# This way we avoid having to score finalized states + # We do have to take care to keep indexes aligned, though + if not stcls.is_final(): + states.append(stcls) token_ids = self.get_token_ids(states) vectors = state2vec(token_ids) scores = vec2scores(vectors) + j = 0 + c_scores = scores.data for i in range(beam.size): stcls = beam.at(i) if not stcls.is_final(): self.moves.set_valid(beam.is_valid[i], stcls.c) - for j in range(nr_class): - beam.scores[i][j] = scores[i, j] + for k in range(nr_class): + beam.scores[i][k] = c_scores[j * scores.shape[1] + k] + j += 1 beam.advance(_transition_state, _hash_state, self.moves.c) beam.check_done(_check_final_state, NULL) beams.append(beam) return beams - cdef void _parse_step(self, StateC* state, - const float* feat_weights, - int nr_class, int nr_feat, int nr_piece) nogil: - '''This only works with no hidden layers -- fast but inaccurate''' - #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): - # self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) - token_ids = calloc(nr_feat, sizeof(int)) - scores = calloc(nr_class * nr_piece, sizeof(float)) - is_valid = calloc(nr_class, sizeof(int)) - - state.set_context_tokens(token_ids, nr_feat) - sum_state_features(scores, - feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) - self.moves.set_valid(is_valid, state) - guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) - action = self.moves.c[guess] - action.do(state, action.label) - - free(is_valid) - free(scores) - free(token_ids) - def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): + if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: + return self.update_beam(docs_tokvecs, golds, + self.cfg['beam_width'], self.cfg['beam_density'], + drop=drop, sgd=sgd, losses=losses) if losses is not None and self.name not in losses: losses[self.name] = 0. docs, tokvec_lists = docs_tokvecs @@ -491,6 +494,10 @@ cdef class Parser: if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs cuda_stream = get_cuda_stream() @@ -517,13 +524,14 @@ cdef class Parser: scores, bp_scores = vec2scores.begin_update(vector, drop=drop) d_scores = self.get_batch_loss(states, golds, scores) - d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) + d_scores /= len(docs) + d_vector = bp_scores(d_scores, sgd=sgd) if drop != 0: d_vector *= mask if isinstance(self.model[0].ops, CupyOps) \ and not isinstance(token_ids, state2vec.ops.xp.ndarray): - # Move token_ids and d_vector to CPU, asynchronously + # Move token_ids and d_vector to GPU, asynchronously backprops.append(( get_async(cuda_stream, token_ids), get_async(cuda_stream, d_vector), @@ -540,7 +548,62 @@ cdef class Parser: break self._make_updates(d_tokvecs, backprops, sgd, cuda_stream) - return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs + + def update_beam(self, docs_tokvecs, golds, width=None, density=None, + drop=0., sgd=None, losses=None): + if width is None: + width = self.cfg.get('beam_width', 2) + if density is None: + density = self.cfg.get('beam_density', 0.0) + if losses is not None and self.name not in losses: + losses[self.name] = 0. 
+ docs, tokvecs = docs_tokvecs + lengths = [len(d) for d in docs] + assert min(lengths) >= 1 + tokvecs = self.model[0].ops.flatten(tokvecs) + if USE_FINE_TUNE: + my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) + my_tokvecs = self.model[0].ops.flatten(my_tokvecs) + tokvecs += my_tokvecs + + states = self.moves.init_batch(docs) + for gold in golds: + self.moves.preprocess_gold(gold) + + cuda_stream = get_cuda_stream() + state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) + + states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, + states, tokvecs, golds, + state2vec, vec2scores, + width, density, + sgd=sgd, drop=drop, losses=losses) + backprop_lower = [] + cdef float batch_size = len(docs) + for i, d_scores in enumerate(states_d_scores): + d_scores /= batch_size + if losses is not None: + losses[self.name] += (d_scores**2).sum() + ids, bp_vectors, bp_scores = backprops[i] + d_vector = bp_scores(d_scores, sgd=sgd) + if isinstance(self.model[0].ops, CupyOps) \ + and not isinstance(ids, state2vec.ops.xp.ndarray): + backprop_lower.append(( + get_async(cuda_stream, ids), + get_async(cuda_stream, d_vector), + bp_vectors)) + else: + backprop_lower.append((ids, d_vector, bp_vectors)) + d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) + self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) + d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) + if USE_FINE_TUNE: + bp_my_tokvecs(d_tokvecs, sgd=sgd) + return d_tokvecs def _init_gold_batch(self, whole_docs, whole_golds): """Make a square batch, of length equal to the shortest doc. A long @@ -585,14 +648,10 @@ cdef class Parser: xp = get_array_module(d_tokvecs) for ids, d_vector, bp_vector in backprops: d_state_features = bp_vector(d_vector, sgd=sgd) - active_feats = ids * (ids >= 0) - active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1)) - if hasattr(xp, 'scatter_add'): - xp.scatter_add(d_tokvecs, - ids, d_state_features * active_feats) - else: - xp.add.at(d_tokvecs, - ids, d_state_features * active_feats) + mask = ids >= 0 + d_state_features *= mask.reshape(ids.shape + (1,)) + self.model[0].ops.scatter_add(d_tokvecs, ids * mask, + d_state_features) @property def move_names(self): @@ -603,12 +662,12 @@ cdef class Parser: return names def get_batch_model(self, batch_size, tokvecs, stream, dropout): - lower, upper = self.model + _, lower, upper = self.model state2vec = precompute_hiddens(batch_size, tokvecs, lower, stream, drop=dropout) return state2vec, upper - nr_feature = 13 + nr_feature = 8 def get_token_ids(self, states): cdef StateClass state @@ -693,10 +752,12 @@ cdef class Parser: def to_disk(self, path, **exclude): serializers = { - 'lower_model': lambda p: p.open('wb').write( + 'tok2vec_model': lambda p: p.open('wb').write( self.model[0].to_bytes()), - 'upper_model': lambda p: p.open('wb').write( + 'lower_model': lambda p: p.open('wb').write( self.model[1].to_bytes()), + 'upper_model': lambda p: p.open('wb').write( + self.model[2].to_bytes()), 'vocab': lambda p: self.vocab.to_disk(p), 'moves': lambda p: self.moves.to_disk(p, strings=False), 'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) @@ -717,24 +778,29 @@ cdef class Parser: self.model, cfg = self.Model(**self.cfg) else: cfg = {} - with (path / 'lower_model').open('rb') as file_: + with (path / 'tok2vec_model').open('rb') as file_: bytes_data = file_.read() self.model[0].from_bytes(bytes_data) - with (path / 'upper_model').open('rb') as file_: + with 
(path / 'lower_model').open('rb') as file_: bytes_data = file_.read() self.model[1].from_bytes(bytes_data) + with (path / 'upper_model').open('rb') as file_: + bytes_data = file_.read() + self.model[2].from_bytes(bytes_data) self.cfg.update(cfg) return self def to_bytes(self, **exclude): serializers = OrderedDict(( - ('lower_model', lambda: self.model[0].to_bytes()), - ('upper_model', lambda: self.model[1].to_bytes()), + ('tok2vec_model', lambda: self.model[0].to_bytes()), + ('lower_model', lambda: self.model[1].to_bytes()), + ('upper_model', lambda: self.model[2].to_bytes()), ('vocab', lambda: self.vocab.to_bytes()), ('moves', lambda: self.moves.to_bytes(strings=False)), ('cfg', lambda: ujson.dumps(self.cfg)) )) if 'model' in exclude: + exclude['tok2vec_model'] = True exclude['lower_model'] = True exclude['upper_model'] = True exclude.pop('model') @@ -745,6 +811,7 @@ cdef class Parser: ('vocab', lambda b: self.vocab.from_bytes(b)), ('moves', lambda b: self.moves.from_bytes(b, strings=False)), ('cfg', lambda b: self.cfg.update(ujson.loads(b))), + ('tok2vec_model', lambda b: None), ('lower_model', lambda b: None), ('upper_model', lambda b: None) )) @@ -754,10 +821,12 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves) else: cfg = {} + if 'tok2vec_model' in msg: + self.model[0].from_bytes(msg['tok2vec_model']) if 'lower_model' in msg: - self.model[0].from_bytes(msg['lower_model']) + self.model[1].from_bytes(msg['lower_model']) if 'upper_model' in msg: - self.model[1].from_bytes(msg['upper_model']) + self.model[2].from_bytes(msg['upper_model']) self.cfg.update(cfg) return self diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 27b375bba..9cf82e0c7 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -99,6 +99,9 @@ cdef class TransitionSystem: def preprocess_gold(self, GoldParse gold): raise NotImplementedError + def is_gold_parse(self, StateClass state, GoldParse gold): + raise NotImplementedError + cdef Transition lookup_transition(self, object name) except *: raise NotImplementedError @@ -107,6 +110,8 @@ cdef class TransitionSystem: def is_valid(self, StateClass stcls, move_name): action = self.lookup_transition(move_name) + if action.move == 0: + return False return action.is_valid(stcls.c, action.label) cdef int set_valid(self, int* is_valid, const StateC* st) nogil: diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 42b55745f..30a6367c8 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc): parser(doc, beam_width=32, beam_density=0.001) for word in doc: print(word.text, word.head, word.dep_) + + +def test_update_doc_beam(parser, tok2vec, model, doc, gold): + parser.model = model + tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) + d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) + assert d_tokvecs[0].shape == tokvecs[0].shape + def optimize(weights, gradient, key=None): + weights -= 0.001 * gradient + bp_tokvecs(d_tokvecs, sgd=optimize) + assert d_tokvecs[0].sum() == 0. 
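
The _make_updates hunk above replaces the xp.scatter_add / xp.add.at branch with a single masked ops.scatter_add call. Below is a minimal NumPy sketch of that accumulation, with numpy.add.at standing in for the ops method; the helper name and toy shapes are purely illustrative, not the parser's internals.

    import numpy

    def accumulate_gradients(d_tokvecs, ids, d_state_features):
        """Scatter per-state feature gradients back onto the token vectors.

        ids holds one token index per (state, feature) slot; -1 marks padding.
        """
        mask = ids >= 0
        # Zero the gradient of padded slots, then clamp their index to 0 so the
        # scatter stays in-bounds; the zeroed gradient makes those slots a no-op.
        d_state_features = d_state_features * mask[:, :, None]
        numpy.add.at(d_tokvecs, ids * mask, d_state_features)
        return d_tokvecs

    d_tokvecs = numpy.zeros((5, 4), dtype='f')   # 5 tokens, vector width 4
    ids = numpy.array([[0, 2, -1], [1, 1, 4]])   # 2 states x 3 features
    d_feats = numpy.ones((2, 3, 4), dtype='f')
    accumulate_gradients(d_tokvecs, ids, d_feats)
    assert d_tokvecs[1].sum() == 8.              # token 1 was used twice
    assert d_tokvecs[3].sum() == 0.              # token 3 never appears

Padded slots are clamped to index 0, but their gradient is zeroed first, so they contribute nothing; this is why the real code multiplies both ids and d_state_features by the mask.
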
+ + diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py new file mode 100644 index 000000000..ab8bf012b --- /dev/null +++ b/spacy/tests/parser/test_nn_beam.py @@ -0,0 +1,87 @@ +from __future__ import unicode_literals +import pytest +import numpy +from thinc.api import layerize + +from ...vocab import Vocab +from ...syntax.arc_eager import ArcEager +from ...tokens import Doc +from ...gold import GoldParse +from ...syntax._beam_utils import ParserBeam, update_beam +from ...syntax.stateclass import StateClass + + +@pytest.fixture +def vocab(): + return Vocab() + +@pytest.fixture +def moves(vocab): + aeager = ArcEager(vocab.strings, {}) + aeager.add_action(2, 'nsubj') + aeager.add_action(3, 'dobj') + aeager.add_action(2, 'aux') + return aeager + + +@pytest.fixture +def docs(vocab): + return [Doc(vocab, words=['Rats', 'bite', 'things'])] + +@pytest.fixture +def states(docs): + return [StateClass(doc) for doc in docs] + +@pytest.fixture +def tokvecs(docs, vector_size): + output = [] + for doc in docs: + vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) + output.append(numpy.asarray(vec)) + return output + + +@pytest.fixture +def golds(docs): + return [GoldParse(doc) for doc in docs] + + +@pytest.fixture +def batch_size(docs): + return len(docs) + + +@pytest.fixture +def beam_width(): + return 4 + + +@pytest.fixture +def vector_size(): + return 6 + + +@pytest.fixture +def beam(moves, states, golds, beam_width): + return ParserBeam(moves, states, golds, width=beam_width, density=0.0) + +@pytest.fixture +def scores(moves, batch_size, beam_width): + return [ + numpy.asarray( + numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), + dtype='f') + for _ in range(batch_size)] + + +def test_create_beam(beam): + pass + + +def test_beam_advance(beam, scores): + beam.advance(scores) + + +def test_beam_advance_too_few_scores(beam, scores): + with pytest.raises(IndexError): + beam.advance(scores[:-1]) diff --git a/spacy/tests/regression/test_issue1257.py b/spacy/tests/regression/test_issue1257.py new file mode 100644 index 000000000..de6b014a6 --- /dev/null +++ b/spacy/tests/regression/test_issue1257.py @@ -0,0 +1,12 @@ +'''Test tokens compare correctly''' +from __future__ import unicode_literals + +from ..util import get_doc +from ...vocab import Vocab + + +def test_issue1257(): + doc1 = get_doc(Vocab(), ['a', 'b', 'c']) + doc2 = get_doc(Vocab(), ['a', 'c', 'e']) + assert doc1[0] != doc2[0] + assert not doc1[0] == doc2[0] diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py index fa9a776bb..3154687c3 100644 --- a/spacy/tests/serialize/test_serialize_tagger.py +++ b/spacy/tests/serialize/test_serialize_tagger.py @@ -11,8 +11,8 @@ import pytest def taggers(en_vocab): tagger1 = Tagger(en_vocab) tagger2 = Tagger(en_vocab) - tagger1.model = tagger1.Model(None, None) - tagger2.model = tagger2.Model(None, None) + tagger1.model = tagger1.Model(8, 8) + tagger2.model = tagger1.model return (tagger1, tagger2) @@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): tagger1, tagger2 = taggers tagger1_b = tagger1.to_bytes() tagger2_b = tagger2.to_bytes() - assert tagger1_b == tagger2_b tagger1 = tagger1.from_bytes(tagger1_b) assert tagger1.to_bytes() == tagger1_b new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b) diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index d22fa52ae..7ed9333b8 100644 --- a/spacy/tests/spans/test_span.py +++ 
b/spacy/tests/spans/test_span.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from ..util import get_doc +from ...attrs import ORTH, LENGTH import pytest @@ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer): span3 = tokens[0:2] assert hash(span3) == hash(span1) + +def test_spans_by_character(doc): + span1 = doc[1:-2] + span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE') + assert span1.start_char == span2.start_char + assert span1.end_char == span2.end_char + assert span2.label_ == 'GPE' + + +def test_span_to_array(doc): + span = doc[1:-2] + arr = span.to_array([ORTH, LENGTH]) + assert arr.shape == (len(span), 2) + assert arr[0, 0] == span[0].orth + assert arr[0, 1] == len(span[0]) + diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 56aeb5223..2f474a926 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors): """Add list of vector tuples to given vocab. All vectors need to have the same length. Format: [("text", [1, 2, 3])]""" length = len(vectors[0][1]) - vocab.resize_vectors(length) + vocab.clear_vectors(length) for word, vec in vectors: - vocab[word].vector = vec + vocab.set_vector(word, vec) return vocab diff --git a/spacy/tests/vectors/test_similarity.py b/spacy/tests/vectors/test_similarity.py index 1260728be..f9c18adca 100644 --- a/spacy/tests/vectors/test_similarity.py +++ b/spacy/tests/vectors/test_similarity.py @@ -14,10 +14,9 @@ def vectors(): @pytest.fixture() def vocab(en_vocab, vectors): - #return add_vecs_to_vocab(en_vocab, vectors) - return None + add_vecs_to_vocab(en_vocab, vectors) + return en_vocab -@pytest.mark.xfail def test_vectors_similarity_LL(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors lex1 = vocab[word1] @@ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors): assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) -@pytest.mark.xfail def test_vectors_similarity_TT(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) @@ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors): assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) -@pytest.mark.xfail def test_vectors_similarity_TD(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) assert doc.similarity(doc[0]) == doc[0].similarity(doc) -@pytest.mark.xfail def test_vectors_similarity_DS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) -@pytest.mark.xfail def test_vectors_similarity_TS(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors doc = get_doc(vocab, words=[word1, word2]) diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py index c42c3a4ce..798871edd 100644 --- a/spacy/tests/vectors/test_vectors.py +++ b/spacy/tests/vectors/test_vectors.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from ...vectors import Vectors +from ...tokenizer import Tokenizer +from ..util import add_vecs_to_vocab, get_doc import numpy import pytest @@ -11,22 +13,42 @@ import pytest def strings(): return ["apple", "orange"] +@pytest.fixture +def vectors(): + return [ + ("apple", [1, 2, 3]), + ("orange", [-1, -2, -3]), + ('and', [-1, -1, -1]), + ('juice', [5, 5, 10]), + ('pie', [7, 6.3, 8.9])] + + @pytest.fixture def data(): return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f') +@pytest.fixture() +def 
vocab(en_vocab, vectors): + add_vecs_to_vocab(en_vocab, vectors) + return en_vocab + + def test_init_vectors_with_data(strings, data): v = Vectors(strings, data) assert v.shape == data.shape def test_init_vectors_with_width(strings): v = Vectors(strings, 3) + for string in strings: + v.add(string) assert v.shape == (len(strings), 3) def test_get_vector(strings, data): v = Vectors(strings, data) + for string in strings: + v.add(string) assert list(v[strings[0]]) == list(data[0]) assert list(v[strings[0]]) != list(data[1]) assert list(v[strings[1]]) != list(data[0]) @@ -35,6 +57,8 @@ def test_get_vector(strings, data): def test_set_vector(strings, data): orig = data.copy() v = Vectors(strings, data) + for string in strings: + v.add(string) assert list(v[strings[0]]) == list(orig[0]) assert list(v[strings[0]]) != list(orig[1]) v[strings[0]] = data[1] @@ -42,125 +66,111 @@ def test_set_vector(strings, data): assert list(v[strings[0]]) != list(orig[0]) -# -#@pytest.fixture() -#def tokenizer_v(vocab): -# return Tokenizer(vocab, {}, None, None, None) -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', ["apple and orange"]) -#def test_vectors_token_vector(tokenizer_v, vectors, text): -# doc = tokenizer_v(text) -# assert vectors[0] == (doc[0].text, list(doc[0].vector)) -# assert vectors[1] == (doc[2].text, list(doc[2].vector)) -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', ["apple", "orange"]) -#def test_vectors_lexeme_vector(vocab, text): -# lex = vocab[text] -# assert list(lex.vector) -# assert lex.vector_norm -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) -#def test_vectors_doc_vector(vocab, text): -# doc = get_doc(vocab, text) -# assert list(doc.vector) -# assert doc.vector_norm -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) -#def test_vectors_span_vector(vocab, text): -# span = get_doc(vocab, text)[0:2] -# assert list(span.vector) -# assert span.vector_norm -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', ["apple orange"]) -#def test_vectors_token_token_similarity(tokenizer_v, text): -# doc = tokenizer_v(text) -# assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) -# assert 0.0 < doc[0].similarity(doc[1]) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) -#def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2): -# token = tokenizer_v(text1) -# lex = vocab[text2] -# assert token.similarity(lex) == lex.similarity(token) -# assert 0.0 < token.similarity(lex) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_token_span_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0]) -# assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_token_doc_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0].similarity(doc) == doc.similarity(doc[0]) -# assert 0.0 < doc[0].similarity(doc) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_lexeme_span_similarity(vocab, text): -# doc = get_doc(vocab, text) -# lex = vocab[text[0]] -# assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex) -# assert 0.0 < doc.similarity(doc[1:3]) < 1.0 -# -# -#@pytest.mark.xfail 
-#@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) -#def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2): -# lex1 = vocab[text1] -# lex2 = vocab[text2] -# assert lex1.similarity(lex2) == lex2.similarity(lex1) -# assert 0.0 < lex1.similarity(lex2) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_lexeme_doc_similarity(vocab, text): -# doc = get_doc(vocab, text) -# lex = vocab[text[0]] -# assert lex.similarity(doc) == doc.similarity(lex) -# assert 0.0 < lex.similarity(doc) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_span_span_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) -# assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) -#def test_vectors_span_doc_similarity(vocab, text): -# doc = get_doc(vocab, text) -# assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) -# assert 0.0 < doc[0:2].similarity(doc) < 1.0 -# -# -#@pytest.mark.xfail -#@pytest.mark.parametrize('text1,text2', [ -# (["apple", "and", "apple", "pie"], ["orange", "juice"])]) -#def test_vectors_doc_doc_similarity(vocab, text1, text2): -# doc1 = get_doc(vocab, text1) -# doc2 = get_doc(vocab, text2) -# assert doc1.similarity(doc2) == doc2.similarity(doc1) -# assert 0.0 < doc1.similarity(doc2) < 1.0 + +@pytest.fixture() +def tokenizer_v(vocab): + return Tokenizer(vocab, {}, None, None, None) + + +@pytest.mark.parametrize('text', ["apple and orange"]) +def test_vectors_token_vector(tokenizer_v, vectors, text): + doc = tokenizer_v(text) + assert vectors[0] == (doc[0].text, list(doc[0].vector)) + assert vectors[1] == (doc[2].text, list(doc[2].vector)) + + +@pytest.mark.parametrize('text', ["apple", "orange"]) +def test_vectors_lexeme_vector(vocab, text): + lex = vocab[text] + assert list(lex.vector) + assert lex.vector_norm + + +@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) +def test_vectors_doc_vector(vocab, text): + doc = get_doc(vocab, text) + assert list(doc.vector) + assert doc.vector_norm + + +@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) +def test_vectors_span_vector(vocab, text): + span = get_doc(vocab, text)[0:2] + assert list(span.vector) + assert span.vector_norm + + +@pytest.mark.parametrize('text', ["apple orange"]) +def test_vectors_token_token_similarity(tokenizer_v, text): + doc = tokenizer_v(text) + assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) + assert -1. < doc[0].similarity(doc[1]) < 1.0 + + +@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) +def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2): + token = tokenizer_v(text1) + lex = vocab[text2] + assert token.similarity(lex) == lex.similarity(token) + assert -1. < token.similarity(lex) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_token_span_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0]) + assert -1. < doc[0].similarity(doc[1:3]) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_token_doc_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0].similarity(doc) == doc.similarity(doc[0]) + assert -1. 
< doc[0].similarity(doc) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_lexeme_span_similarity(vocab, text): + doc = get_doc(vocab, text) + lex = vocab[text[0]] + assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex) + assert -1. < doc.similarity(doc[1:3]) < 1.0 + + +@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) +def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2): + lex1 = vocab[text1] + lex2 = vocab[text2] + assert lex1.similarity(lex2) == lex2.similarity(lex1) + assert -1. < lex1.similarity(lex2) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_lexeme_doc_similarity(vocab, text): + doc = get_doc(vocab, text) + lex = vocab[text[0]] + assert lex.similarity(doc) == doc.similarity(lex) + assert -1. < lex.similarity(doc) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_span_span_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) + assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0 + + +@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) +def test_vectors_span_doc_similarity(vocab, text): + doc = get_doc(vocab, text) + assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) + assert -1. < doc[0:2].similarity(doc) < 1.0 + + +@pytest.mark.parametrize('text1,text2', [ + (["apple", "and", "apple", "pie"], ["orange", "juice"])]) +def test_vectors_doc_doc_similarity(vocab, text1, text2): + doc1 = get_doc(vocab, text1) + doc2 = get_doc(vocab, text2) + assert doc1.similarity(doc2) == doc2.similarity(doc1) + assert -1. < doc1.similarity(doc2) < 1.0 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 822a0152d..dd52c4cbf 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -238,6 +238,29 @@ cdef class Doc: def doc(self): return self + def char_span(self, int start_idx, int end_idx, label=0, vector=None): + """Create a `Span` object from the slice `doc.text[start : end]`. + + doc (Doc): The parent document. + start (int): The index of the first character of the span. + end (int): The index of the first character after the span. + label (uint64 or string): A label to attach to the Span, e.g. for named entities. + vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. + RETURNS (Span): The newly constructed object. + """ + if not isinstance(label, int): + label = self.vocab.strings.add(label) + cdef int start = token_by_start(self.c, self.length, start_idx) + if start == -1: + return None + cdef int end = token_by_end(self.c, self.length, end_idx) + if end == -1: + return None + # Currently we have the token index, we want the range-end index + end += 1 + cdef Span span = Span(self, start, end, label=label, vector=vector) + return span + def similarity(self, other): """Make a semantic similarity estimate. The default estimate is cosine similarity using an average of word vectors. 
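
The Doc.char_span method added above only returns a Span when both character offsets line up exactly with token boundaries; otherwise token_by_start or token_by_end returns -1 and the method returns None. A pure-Python sketch of that lookup, using toy offsets rather than the Cython helpers:

    def char_span(token_offsets, start_idx, end_idx):
        """Map a character slice to a (start_token, end_token) slice, or None.

        token_offsets is a list of (start_char, end_char) pairs, one per token.
        """
        starts = {start: i for i, (start, end) in enumerate(token_offsets)}
        ends = {end: i for i, (start, end) in enumerate(token_offsets)}
        if start_idx not in starts or end_idx not in ends:
            return None            # offsets don't line up with token boundaries
        # Convert the index of the last token into a range-end index, as the
        # `end += 1` in the implementation does.
        return starts[start_idx], ends[end_idx] + 1

    # 'I like New York' -> tokens 'I', 'like', 'New', 'York'
    offsets = [(0, 1), (2, 6), (7, 10), (11, 15)]
    assert char_span(offsets, 7, 15) == (2, 4)     # doc[2:4] == 'New York'
    assert char_span(offsets, 8, 15) is None       # 8 falls mid-token

With the patched Doc, the jade example added later in this diff, doc.char_span(7, 15, label=u'GPE'), corresponds to the first assertion.
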
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index 8d675c04f..9645189a5 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -15,5 +15,5 @@ cdef class Span: cdef public _vector cdef public _vector_norm - cpdef int _recalculate_indices(self) except -1 + cpdef np.ndarray to_array(self, object features) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 9f2115fe1..7e29cccf4 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -7,7 +7,7 @@ import numpy import numpy.linalg from libc.math cimport sqrt -from .doc cimport token_by_start, token_by_end +from .doc cimport token_by_start, token_by_end, get_token_attr from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t @@ -135,6 +135,29 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + cpdef np.ndarray to_array(self, object py_attr_ids): + """Given a list of M attribute IDs, export the tokens to a numpy + `ndarray` of shape `(N, M)`, where `N` is the length of the document. + The values will be 32-bit integers. + + attr_ids (list[int]): A list of attribute ID ints. + RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row + per word, and one column per attribute indicated in the input + `attr_ids`. + """ + cdef int i, j + cdef attr_id_t feature + cdef np.ndarray[attr_t, ndim=2] output + # Make an array from the attributes --- otherwise our inner loop is Python + # dict iteration. + cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) + cdef int length = self.end - self.start + output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64) + for i in range(self.start, self.end): + for j, feature in enumerate(attr_ids): + output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature) + return output + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 5b8c276d8..7b11d6efa 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -62,18 +62,26 @@ cdef class Token: def __richcmp__(self, Token other, int op): # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html + cdef Doc my_doc = self.doc + cdef Doc other_doc = other.doc my = self.idx their = other.idx if other is not None else None if op == 0: return my < their elif op == 2: - return my == their + if my_doc is other_doc: + return my == their + else: + return False elif op == 4: return my > their elif op == 1: return my <= their elif op == 3: - return my != their + if my_doc is other_doc: + return my != their + else: + return True elif op == 5: return my >= their else: diff --git a/spacy/util.py b/spacy/util.py index d83fe3416..645f8b3f7 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -22,7 +22,7 @@ import ujson from .symbols import ORTH from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ -from .compat import copy_array, normalize_string_keys, getattr_ +from .compat import copy_array, normalize_string_keys, getattr_, import_file LANGUAGES = {} @@ -112,15 +112,13 @@ def load_model(name, **overrides): def load_model_from_link(name, **overrides): """Load a model from a shortcut link, or directory in spaCy data path.""" - init_file = get_data_path() / name / '__init__.py' - spec = importlib.util.spec_from_file_location(name, str(init_file)) + path = 
get_data_path() / name / '__init__.py' try: - cls = importlib.util.module_from_spec(spec) + cls = import_file(name, path) except AttributeError: raise IOError( "Cant' load '%s'. If you're using a shortcut link, make sure it " "points to a valid model package (not just a data directory)." % name) - spec.loader.exec_module(cls) return cls.load(**overrides) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 35d4d17ab..72e30bd2f 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,18 +1,25 @@ +from __future__ import unicode_literals +from libc.stdint cimport int32_t, uint64_t import numpy from collections import OrderedDict import msgpack import msgpack_numpy msgpack_numpy.patch() +cimport numpy as np +from .typedefs cimport attr_t from .strings cimport StringStore from . import util +from .compat import basestring_ cdef class Vectors: '''Store, save and load word vectors.''' cdef public object data cdef readonly StringStore strings - cdef public object key2i + cdef public object key2row + cdef public object keys + cdef public int i def __init__(self, strings, data_or_width): self.strings = StringStore() @@ -21,10 +28,10 @@ cdef class Vectors: dtype='f') else: data = data_or_width + self.i = 0 self.data = data - self.key2i = {} - for i, string in enumerate(strings): - self.key2i[self.strings.add(string)] = i + self.key2row = {} + self.keys = np.ndarray((self.data.shape[0],), dtype='uint64') def __reduce__(self): return (Vectors, (self.strings, self.data)) @@ -32,7 +39,7 @@ cdef class Vectors: def __getitem__(self, key): if isinstance(key, basestring): key = self.strings[key] - i = self.key2i[key] + i = self.key2row[key] if i is None: raise KeyError(key) else: @@ -41,14 +48,36 @@ cdef class Vectors: def __setitem__(self, key, vector): if isinstance(key, basestring): key = self.strings.add(key) - i = self.key2i[key] + i = self.key2row[key] self.data[i] = vector def __iter__(self): yield from self.data def __len__(self): - return len(self.strings) + return self.i + + def __contains__(self, key): + if isinstance(key, basestring_): + key = self.strings[key] + return key in self.key2row + + def add(self, key, vector=None): + if isinstance(key, basestring_): + key = self.strings.add(key) + if key not in self.key2row: + i = self.i + if i >= self.keys.shape[0]: + self.keys.resize((self.keys.shape[0]*2,)) + self.data.resize((self.data.shape[0]*2, self.data.shape[1])) + self.key2row[key] = self.i + self.keys[self.i] = key + self.i += 1 + else: + i = self.key2row[key] + if vector is not None: + self.data[i] = vector + return i def items(self): for i, string in enumerate(self.strings): @@ -61,34 +90,60 @@ cdef class Vectors: def most_similar(self, key): raise NotImplementedError - def to_disk(self, path): - raise NotImplementedError + def to_disk(self, path, **exclude): + serializers = OrderedDict(( + ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), + ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), + )) + return util.to_disk(path, serializers, exclude) - def from_disk(self, path): - raise NotImplementedError + def from_disk(self, path, **exclude): + def load_keys(path): + if path.exists(): + self.keys = numpy.load(path) + for i, key in enumerate(self.keys): + self.keys[i] = key + self.key2row[key] = i + + def load_vectors(path): + if path.exists(): + self.data = numpy.load(path) + + serializers = OrderedDict(( + ('keys', load_keys), + ('vectors', load_vectors), + )) + util.from_disk(path, serializers, exclude) + return self def 
to_bytes(self, **exclude): def serialize_weights(): - if hasattr(self.weights, 'to_bytes'): - return self.weights.to_bytes() + if hasattr(self.data, 'to_bytes'): + return self.data.to_bytes() else: - return msgpack.dumps(self.weights) - + return msgpack.dumps(self.data) serializers = OrderedDict(( - ('strings', lambda: self.strings.to_bytes()), - ('weights', serialize_weights) + ('keys', lambda: msgpack.dumps(self.keys)), + ('vectors', serialize_weights) )) return util.to_bytes(serializers, exclude) def from_bytes(self, data, **exclude): def deserialize_weights(b): - if hasattr(self.weights, 'from_bytes'): - self.weights.from_bytes() + if hasattr(self.data, 'from_bytes'): + self.data.from_bytes() else: - self.weights = msgpack.loads(b) + self.data = msgpack.loads(b) + + def load_keys(keys): + self.keys.resize((len(keys),)) + for i, key in enumerate(keys): + self.keys[i] = key + self.key2row[key] = i deserializers = OrderedDict(( - ('strings', lambda b: self.strings.from_bytes(b)), - ('weights', deserialize_weights) + ('keys', lambda b: load_keys(msgpack.loads(b))), + ('vectors', deserialize_weights) )) - return util.from_bytes(deserializers, exclude) + util.from_bytes(data, deserializers, exclude) + return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 149317779..dc141552d 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,9 +19,10 @@ from .tokens.token cimport Token from .attrs cimport PROB, LANG from .structs cimport SerializedLexemeC -from .compat import copy_reg, pickle +from .compat import copy_reg, pickle, basestring_ from .lemmatizer import Lemmatizer from .attrs import intify_attrs +from .vectors import Vectors from . import util from . import attrs from . import symbols @@ -63,6 +64,7 @@ cdef class Vocab: self.strings.add(name) self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings, tag_map, lemmatizer) + self.vectors = Vectors(self.strings, 300) property lang: def __get__(self): @@ -242,13 +244,15 @@ cdef class Vocab: @property def vectors_length(self): - raise NotImplementedError + return len(self.vectors) - def clear_vectors(self): + def clear_vectors(self, new_dim=None): """Drop the current vector table. Because all vectors must be the same width, you have to call this to change the size of the vectors. """ - raise NotImplementedError + if new_dim is None: + new_dim = self.vectors.data.shape[1] + self.vectors = Vectors(self.strings, new_dim) def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. @@ -262,7 +266,9 @@ cdef class Vocab: RAISES: If no vectors data is loaded, ValueError is raised. """ - raise NotImplementedError + if isinstance(orth, basestring_): + orth = self.strings.add(orth) + return self.vectors[orth] def set_vector(self, orth, vector): """Set a vector for a word in the vocabulary. @@ -272,15 +278,19 @@ cdef class Vocab: RETURNS: None """ - raise NotImplementedError + if not isinstance(orth, basestring_): + orth = self.strings[orth] + self.vectors.add(orth, vector=vector) def has_vector(self, orth): """Check whether a word has a vector. Returns False if no vectors have been loaded. Words can be looked up by string or int ID.""" - return False + if isinstance(orth, basestring_): + orth = self.strings.add(orth) + return orth in self.vectors - def to_disk(self, path): + def to_disk(self, path, **exclude): """Save the current state to a directory. 
path (unicode or Path): A path to a directory, which will be created if @@ -292,8 +302,10 @@ cdef class Vocab: self.strings.to_disk(path / 'strings.json') with (path / 'lexemes.bin').open('wb') as file_: file_.write(self.lexemes_to_bytes()) + if self.vectors is not None: + self.vectors.to_disk(path) - def from_disk(self, path): + def from_disk(self, path, **exclude): """Loads state from a directory. Modifies the object in place and returns it. @@ -305,6 +317,8 @@ cdef class Vocab: self.strings.from_disk(path / 'strings.json') with (path / 'lexemes.bin').open('rb') as file_: self.lexemes_from_bytes(file_.read()) + if self.vectors is not None: + self.vectors.from_disk(path, exclude='strings.json') return self def to_bytes(self, **exclude): @@ -313,9 +327,16 @@ cdef class Vocab: **exclude: Named attributes to prevent from being serialized. RETURNS (bytes): The serialized form of the `Vocab` object. """ + def deserialize_vectors(): + if self.vectors is None: + return None + else: + return self.vectors.to_bytes(exclude='strings.json') + getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), ('lexemes', lambda: self.lexemes_to_bytes()), + ('vectors', deserialize_vectors) )) return util.to_bytes(getters, exclude) @@ -326,9 +347,15 @@ cdef class Vocab: **exclude: Named attributes to prevent from being loaded. RETURNS (Vocab): The `Vocab` object. """ + def serialize_vectors(b): + if self.vectors is None: + return None + else: + return self.vectors.from_bytes(b, exclude='strings') setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), + ('vectors', lambda b: serialize_vectors(b)) )) util.from_bytes(bytes_data, setters, exclude) return self diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index 2c40858a8..46c3e84d9 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -112,6 +112,10 @@ .u-nowrap white-space: nowrap +.u-break.u-break + word-wrap: break-word + white-space: initial + .u-no-border border: none diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 929985144..7fbbcce97 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -140,6 +140,43 @@ p Get the number of tokens in the document. +cell int +cell The number of tokens in the document. ++h(2, "char_span") Doc.char_span + +tag method + +tag-new(2) + +p Create a #[code Span] object from the slice #[code doc.text[start : end]]. + ++aside-code("Example"). + doc = nlp(u'I like New York') + span = doc.char_span(7, 15, label=u'GPE') + assert span.text == 'New York' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start] + +cell int + +cell The index of the first character of the span. + + +row + +cell #[code end] + +cell int + +cell The index of the first character after the span. + + +row + +cell #[code label] + +cell uint64 / unicode + +cell A label to attach to the Span, e.g. for named entities. + + +row + +cell #[code vector] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell A meaning representation of the span. + + +footrow + +cell returns + +cell #[code Span] + +cell The newly constructed object. + +h(2, "similarity") Doc.similarity +tag method +tag-model("vectors") @@ -211,12 +248,12 @@ p +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell ints + +cell list +cell A list of attribute ID ints. 
+footrow +cell returns - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] +cell | The exported attributes as a 2D numpy array, with one row per | token and one column per attribute. @@ -245,7 +282,7 @@ p +row +cell #[code array] - +cell #[code numpy.ndarray[ndim=2, dtype='int32']] + +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] +cell The attribute values to load. +footrow @@ -509,7 +546,7 @@ p +table(["Name", "Type", "Description"]) +footrow +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the document's semantics. +h(2, "vector_norm") Doc.vector_norm diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 9c26f506c..69665ee9d 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -111,6 +111,14 @@ p +cell - +cell A sequence of unicode objects. + +row + +cell #[code as_tuples] + +cell bool + +cell + | If set to #[code True], inputs should be a sequence of + | #[code (text, context)] tuples. Output will then be a sequence of + | #[code (doc, context)] tuples. Defaults to #[code False]. + +row +cell #[code n_threads] +cell int diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade index a0487be9b..6e3f68493 100644 --- a/website/docs/api/lexeme.jade +++ b/website/docs/api/lexeme.jade @@ -129,7 +129,7 @@ p A real-valued meaning representation. +table(["Name", "Type", "Description"]) +footrow +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the lexeme's semantics. +h(2, "vector_norm") Lexeme.vector_norm diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 542336714..2ca2d3ea9 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]]. +row +cell #[code vector] - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A meaning representation of the span. +footrow @@ -145,11 +145,47 @@ p +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "to_array") Span.to_array + +tag method + +tag-new(2) + +p + | Given a list of #[code M] attribute IDs, export the tokens to a numpy + | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of + | the document. The values will be 32-bit integers. + ++aside-code("Example"). + from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + doc = nlp(u'I like New York in Autumn.') + span = doc[2:3] + # All strings mapped to integers, for easy export to numpy + np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attr_ids] + +cell list + +cell A list of attribute ID ints. + + +footrow + +cell returns + +cell #[code.u-break numpy.ndarray[long, ndim=2]] + +cell + | A feature matrix, with one row per word, and one column per + | attribute indicated in the input #[code attr_ids]. + +h(2, "merge") Span.merge +tag method p Retokenize the document, such that the span is merged into a single token. ++aside-code("Example"). 
+    doc = nlp(u'I like New York in Autumn.')
+    span = doc[2:3]
+    span.merge()
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code **attributes]
@@ -270,7 +306,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade
index 87387e09d..db445d09b 100644
--- a/website/docs/api/token.jade
+++ b/website/docs/api/token.jade
@@ -250,7 +250,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the token's semantics.
 
 +h(2, "vector_norm") Span.vector_norm
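
The language.jade hunk earlier in this patch documents the new as_tuples flag for Language.pipe but adds no code example. A dependency-free sketch of the (text, context) plumbing the flag describes; this illustrates the pattern only and is not spaCy's implementation:

    def pipe_with_context(process, stream, as_tuples=False):
        """Apply process to a stream; with as_tuples=True, carry a context through."""
        if as_tuples:
            for text, context in stream:
                yield process(text), context
        else:
            for text in stream:
                yield process(text)

    data = [('One document.', {'id': 1}), ('Another one.', {'id': 2})]
    for doc, context in pipe_with_context(str.upper, data, as_tuples=True):
        print(doc, context['id'])

With a loaded pipeline, the call would be nlp.pipe(data, as_tuples=True), yielding (doc, context) pairs as the documented table states.
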