Mirror of https://github.com/explosion/spaCy.git
Synced 2025-10-28 06:31:12 +03:00
	Merge remote-tracking branch 'upstream/develop' into indonesian
This commit is contained in: commit fa544e6c9a

setup.py (2 changed lines)
							|  | @ -28,7 +28,9 @@ MOD_NAMES = [ | |||
|     'spacy.pipeline', | ||||
|     'spacy.syntax.stateclass', | ||||
|     'spacy.syntax._state', | ||||
|     'spacy.syntax._beam_utils', | ||||
|     'spacy.tokenizer', | ||||
|     'spacy._cfile', | ||||
|     'spacy.syntax.parser', | ||||
|     'spacy.syntax.nn_parser', | ||||
|     'spacy.syntax.beam_parser', | ||||
|  |  | |||
spacy/_cfile.pxd (26 lines, new file)
							|  | @ -0,0 +1,26 @@ | |||
| from libc.stdio cimport fopen, fclose, fread, fwrite, FILE | ||||
| from cymem.cymem cimport Pool | ||||
| 
 | ||||
| cdef class CFile: | ||||
|     cdef FILE* fp | ||||
|     cdef bint is_open | ||||
|     cdef Pool mem | ||||
|     cdef int size # For compatibility with subclass | ||||
|     cdef int _capacity # For compatibility with subclass | ||||
| 
 | ||||
|     cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 | ||||
| 
 | ||||
|     cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 | ||||
| 
 | ||||
|     cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| cdef class StringCFile(CFile): | ||||
|     cdef unsigned char* data | ||||
|   | ||||
|     cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 | ||||
| 
 | ||||
|     cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 | ||||
|      | ||||
|     cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * | ||||
spacy/_cfile.pyx (88 lines, new file)
							|  | @ -0,0 +1,88 @@ | |||
| from libc.stdio cimport fopen, fclose, fread, fwrite, FILE | ||||
| from libc.string cimport memcpy | ||||
| 
 | ||||
| 
 | ||||
| cdef class CFile: | ||||
|     def __init__(self, loc, mode, on_open_error=None): | ||||
|         if isinstance(mode, unicode): | ||||
|             mode_str = mode.encode('ascii') | ||||
|         else: | ||||
|             mode_str = mode | ||||
|         if hasattr(loc, 'as_posix'): | ||||
|             loc = loc.as_posix() | ||||
|         self.mem = Pool() | ||||
|         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc | ||||
|         self.fp = fopen(<char*>bytes_loc, mode_str) | ||||
|         if self.fp == NULL: | ||||
|             if on_open_error is not None: | ||||
|                 on_open_error() | ||||
|             else: | ||||
|                 raise IOError("Could not open binary file %s" % bytes_loc) | ||||
|         self.is_open = True | ||||
| 
 | ||||
|     def __dealloc__(self): | ||||
|         if self.is_open: | ||||
|             fclose(self.fp) | ||||
| 
 | ||||
|     def close(self): | ||||
|         fclose(self.fp) | ||||
|         self.is_open = False | ||||
| 
 | ||||
|     cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: | ||||
|         st = fread(dest, elem_size, number, self.fp) | ||||
|         if st != number: | ||||
|             raise IOError | ||||
| 
 | ||||
|     cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: | ||||
|         st = fwrite(src, elem_size, number, self.fp) | ||||
|         if st != number: | ||||
|             raise IOError | ||||
| 
 | ||||
|     cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: | ||||
|         cdef void* dest = mem.alloc(number, elem_size) | ||||
|         self.read_into(dest, number, elem_size) | ||||
|         return dest | ||||
| 
 | ||||
|     def write_unicode(self, unicode value): | ||||
|         cdef bytes py_bytes = value.encode('utf8') | ||||
|         cdef char* chars = <char*>py_bytes | ||||
|         # CFile has no write() method; write_from is the matching call. | ||||
|         self.write_from(chars, len(py_bytes), sizeof(char)) | ||||
| 
 | ||||
| 
 | ||||
| cdef class StringCFile: | ||||
|     def __init__(self, mode, bytes data=b'', on_open_error=None): | ||||
|         self.mem = Pool() | ||||
|         self.is_open = 'w' in mode | ||||
|         self._capacity = max(len(data), 8) | ||||
|         self.size = len(data) | ||||
|         self.data = <unsigned char*>self.mem.alloc(1, self._capacity) | ||||
|         for i in range(len(data)): | ||||
|             self.data[i] = data[i] | ||||
| 
 | ||||
|     def close(self): | ||||
|         self.is_open = False | ||||
| 
 | ||||
|     def string_data(self): | ||||
|         # The write path never advances self.data, so the written bytes start | ||||
|         # at the buffer head; (self.data - self.size) would read before it. | ||||
|         return self.data[:self.size] | ||||
| 
 | ||||
|     cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: | ||||
|         memcpy(dest, self.data, elem_size * number) | ||||
|         self.data += elem_size * number | ||||
| 
 | ||||
|     cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: | ||||
|         write_size = number * elem_size | ||||
|         if (self.size + write_size) >= self._capacity: | ||||
|             self._capacity = (self.size + write_size) * 2 | ||||
|             self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity) | ||||
|         memcpy(&self.data[self.size], src, elem_size * number) | ||||
|         self.size += write_size | ||||
| 
 | ||||
|     cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: | ||||
|         cdef void* dest = mem.alloc(number, elem_size) | ||||
|         self.read_into(dest, number, elem_size) | ||||
|         return dest | ||||
| 
 | ||||
|     def write_unicode(self, unicode value): | ||||
|         cdef bytes py_bytes = value.encode('utf8') | ||||
|         cdef char* chars = <char*>py_bytes | ||||
|         # As above: route through write_from rather than a nonexistent write(). | ||||
|         self.write_from(chars, len(py_bytes), sizeof(char)) | ||||
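A minimal usage sketch of the new CFile/StringCFile helpers, assuming the compiled spacy._cfile module; the path and strings are illustrative, and only the Python-visible methods are shown:

    from spacy._cfile import CFile, StringCFile

    cf = CFile('vectors.bin', 'wb')   # wraps fopen(); raises IOError on failure
    cf.write_unicode(u'hello')        # encodes to UTF-8 and writes the raw bytes
    cf.close()

    sf = StringCFile('w')             # same interface, in-memory buffer
    sf.write_unicode(u'hello')        # buffer capacity doubles as needed
    data = sf.string_data()           # bytes written so far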
spacy/_ml.py (122 changed lines)
							|  | @ -5,10 +5,12 @@ from thinc.neural._classes.hash_embed import HashEmbed | |||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.util import get_array_module | ||||
| import random | ||||
| import cytoolz | ||||
| 
 | ||||
| from thinc.neural._classes.convolution import ExtractWindow | ||||
| from thinc.neural._classes.static_vectors import StaticVectors | ||||
| from thinc.neural._classes.batchnorm import BatchNorm | ||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | ||||
| from thinc.neural._classes.layernorm import LayerNorm as LN | ||||
| from thinc.neural._classes.resnet import Residual | ||||
| from thinc.neural import ReLu | ||||
| from thinc.neural._classes.selu import SELU | ||||
|  | @ -19,10 +21,12 @@ from thinc.api import FeatureExtracter, with_getitem | |||
| from thinc.neural.pooling import Pooling, max_pool, mean_pool, sum_pool | ||||
| from thinc.neural._classes.attention import ParametricAttention | ||||
| from thinc.linear.linear import LinearModel | ||||
| from thinc.api import uniqued, wrap | ||||
| from thinc.api import uniqued, wrap, flatten_add_lengths | ||||
| 
 | ||||
| 
 | ||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE, TAG, DEP | ||||
| from .tokens.doc import Doc | ||||
| from . import util | ||||
| 
 | ||||
| import numpy | ||||
| import io | ||||
|  | @ -53,6 +57,27 @@ def _logistic(X, drop=0.): | |||
|     return Y, logistic_bwd | ||||
| 
 | ||||
| 
 | ||||
| @layerize | ||||
| def add_tuples(X, drop=0.): | ||||
|     """Given inputs of sequence pairs, where each sequence is (vals, length), | ||||
|     sum the values, returning a single sequence. | ||||
| 
 | ||||
|     If the input is: | ||||
|     ((vals1, length), (vals2, length)) | ||||
|     the output is: | ||||
|     (vals1+vals2, length) | ||||
| 
 | ||||
|     vals are a single tensor for the whole batch. | ||||
|     """ | ||||
|     (vals1, length1), (vals2, length2) = X | ||||
|     assert length1 == length2 | ||||
| 
 | ||||
|     def add_tuples_bwd(dY, sgd=None): | ||||
|         return (dY, dY) | ||||
| 
 | ||||
|     return (vals1+vals2, length1), add_tuples_bwd | ||||
| 
 | ||||
| 
 | ||||
| def _zero_init(model): | ||||
|     def _zero_init_impl(self, X, y): | ||||
|         self.W.fill(0) | ||||
|  | @ -61,6 +86,7 @@ def _zero_init(model): | |||
|         model.W.fill(0.) | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @layerize | ||||
| def _preprocess_doc(docs, drop=0.): | ||||
|     keys = [doc.to_array([LOWER]) for doc in docs] | ||||
|  | @ -72,7 +98,6 @@ def _preprocess_doc(docs, drop=0.): | |||
|     return (keys, vals, lengths), None | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| def _init_for_precomputed(W, ops): | ||||
|     if (W**2).sum() != 0.: | ||||
|         return | ||||
|  | @ -80,6 +105,7 @@ def _init_for_precomputed(W, ops): | |||
|     ops.xavier_uniform_init(reshaped) | ||||
|     W[:] = reshaped.reshape(W.shape) | ||||
| 
 | ||||
| 
 | ||||
| @describe.on_data(_set_dimensions_if_needed) | ||||
| @describe.attributes( | ||||
|     nI=Dimension("Input size"), | ||||
|  | @ -184,25 +210,36 @@ class PrecomputableMaxouts(Model): | |||
|         return Yfp, backward | ||||
| 
 | ||||
| 
 | ||||
| def drop_layer(layer, factor=2.): | ||||
|     def drop_layer_fwd(X, drop=0.): | ||||
|         drop *= factor | ||||
|         mask = layer.ops.get_dropout_mask((1,), drop) | ||||
|         if mask is None or mask > 0: | ||||
|             return layer.begin_update(X, drop=drop) | ||||
|         else: | ||||
|             return X, lambda dX, sgd=None: dX | ||||
|     return wrap(drop_layer_fwd, layer) | ||||
| 
 | ||||
| 
 | ||||
| def Tok2Vec(width, embed_size, preprocess=None): | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone, '+': add}): | ||||
|         norm = get_col(cols.index(NORM))   >> HashEmbed(width, embed_size, name='embed_lower') | ||||
|         norm = get_col(cols.index(NORM))     >> HashEmbed(width, embed_size, name='embed_lower') | ||||
|         prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size//2, name='embed_prefix') | ||||
|         suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size//2, name='embed_suffix') | ||||
|         shape = get_col(cols.index(SHAPE))   >> HashEmbed(width, embed_size//2, name='embed_shape') | ||||
| 
 | ||||
|         embed = (norm | prefix | suffix | shape ) | ||||
|         embed = (norm | prefix | suffix | shape ) >> LN(Maxout(width, width*4, pieces=3)) | ||||
|         tok2vec = ( | ||||
|             with_flatten( | ||||
|                 asarray(Model.ops, dtype='uint64') | ||||
|                 >> embed | ||||
|                 >> Maxout(width, width*4, pieces=3) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> Maxout(width, width*3)), | ||||
|             pad=4) | ||||
|                 >> uniqued(embed, column=5) | ||||
|                 >> drop_layer( | ||||
|                     Residual( | ||||
|                         (ExtractWindow(nW=1) >> BN(Maxout(width, width*3))) | ||||
|                     ) | ||||
|                 ) ** 4, pad=4 | ||||
|             ) | ||||
|         ) | ||||
|         if preprocess not in (False, None): | ||||
|             tok2vec = preprocess >> tok2vec | ||||
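The drop_layer wrapper above is a stochastic-depth style trick: with probability drop * factor the wrapped layer is skipped entirely and the input passes through unchanged. A toy restatement in plain Python (the coin flip and names are illustrative, not the thinc API):

    import random

    def drop_layer_sketch(layer_fn, X, drop=0.25, factor=2.0):
        # Skip the layer with probability drop * factor: identity forward,
        # identity backward. Otherwise run the layer as usual.
        if random.random() < drop * factor:
            return X, lambda dX: dX
        return layer_fn(X)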
|  | @ -297,7 +334,8 @@ def zero_init(model): | |||
| 
 | ||||
| 
 | ||||
| def doc2feats(cols=None): | ||||
|     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE] | ||||
|     if cols is None: | ||||
|         cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] | ||||
|     def forward(docs, drop=0.): | ||||
|         feats = [] | ||||
|         for doc in docs: | ||||
|  | @ -323,6 +361,37 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.): | |||
|     return vectors, backward | ||||
| 
 | ||||
| 
 | ||||
| def fine_tune(embedding, combine=None): | ||||
|     if combine is not None: | ||||
|         raise NotImplementedError( | ||||
|             "fine_tune currently only supports addition. Set combine=None") | ||||
|     def fine_tune_fwd(docs_tokvecs, drop=0.): | ||||
|         docs, tokvecs = docs_tokvecs | ||||
|         lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i') | ||||
| 
 | ||||
|         vecs, bp_vecs = embedding.begin_update(docs, drop=drop) | ||||
|         flat_tokvecs = embedding.ops.flatten(tokvecs) | ||||
|         flat_vecs = embedding.ops.flatten(vecs) | ||||
|         output = embedding.ops.unflatten( | ||||
|                    (model.mix[0] * flat_vecs + model.mix[1] * flat_tokvecs), | ||||
|                     lengths) | ||||
| 
 | ||||
|         def fine_tune_bwd(d_output, sgd=None): | ||||
|             bp_vecs(d_output, sgd=sgd) | ||||
|             flat_grad = model.ops.flatten(d_output) | ||||
|             model.d_mix[1] += flat_tokvecs.dot(flat_grad.T).sum() | ||||
|             model.d_mix[0] += flat_vecs.dot(flat_grad.T).sum() | ||||
|             if sgd is not None: | ||||
|                 sgd(model._mem.weights, model._mem.gradient, key=model.id) | ||||
|             return d_output | ||||
|         return output, fine_tune_bwd | ||||
|     model = wrap(fine_tune_fwd, embedding) | ||||
|     model.mix = model._mem.add((model.id, 'mix'), (2,)) | ||||
|     model.mix.fill(1.) | ||||
|     model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix')) | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| @layerize | ||||
| def flatten(seqs, drop=0.): | ||||
|     if isinstance(seqs[0], numpy.ndarray): | ||||
|  | @ -369,6 +438,27 @@ def preprocess_doc(docs, drop=0.): | |||
|     vals = ops.allocate(keys.shape[0]) + 1 | ||||
|     return (keys, vals, lengths), None | ||||
| 
 | ||||
| def getitem(i): | ||||
|     def getitem_fwd(X, drop=0.): | ||||
|         return X[i], None | ||||
|     return layerize(getitem_fwd) | ||||
| 
 | ||||
| def build_tagger_model(nr_class, token_vector_width, **cfg): | ||||
|     embed_size = util.env_opt('embed_size', 7500) | ||||
|     with Model.define_operators({'>>': chain, '+': add}): | ||||
|         # Input: (doc, tensor) tuples | ||||
|         private_tok2vec = Tok2Vec(token_vector_width, embed_size, preprocess=doc2feats()) | ||||
| 
 | ||||
|         model = ( | ||||
|             fine_tune(private_tok2vec) | ||||
|             >> with_flatten( | ||||
|                 Maxout(token_vector_width, token_vector_width) | ||||
|                 >> Softmax(nr_class, token_vector_width) | ||||
|             ) | ||||
|         ) | ||||
|     model.nI = None | ||||
|     return model | ||||
| 
 | ||||
| 
 | ||||
| def build_text_classifier(nr_class, width=64, **cfg): | ||||
|     nr_vector = cfg.get('nr_vector', 200) | ||||
|  | @ -383,7 +473,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|             >> _flatten_add_lengths | ||||
|             >> with_getitem(0, | ||||
|                 uniqued( | ||||
|                   (embed_lower | embed_prefix | embed_suffix | embed_shape)  | ||||
|                   (embed_lower | embed_prefix | embed_suffix | embed_shape) | ||||
|                   >> Maxout(width, width+(width//2)*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) | ||||
|                 >> Residual(ExtractWindow(nW=1) >> ReLu(width, width*3)) | ||||
|  | @ -404,7 +494,7 @@ def build_text_classifier(nr_class, width=64, **cfg): | |||
|             >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0)) | ||||
|             >> logistic | ||||
|         ) | ||||
|   | ||||
| 
 | ||||
|     model.lsuv = False | ||||
|     return model | ||||
| 
 | ||||
|  |  | |||
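The fine_tune layer above learns a two-way mixture between freshly embedded vectors and the incoming pretrained tensors; the forward pass reduces to a weighted sum. A numpy restatement of that mixing step (toy shapes; mix stands in for model.mix):

    import numpy as np

    flat_vecs = np.random.rand(5, 8)     # freshly embedded token rows
    flat_tokvecs = np.random.rand(5, 8)  # pretrained tensor rows
    mix = np.array([1.0, 1.0])           # learned mixture weights

    output = mix[0] * flat_vecs + mix[1] * flat_tokvecs
    # The backward pass mirrors this: d_mix[0] accumulates against flat_vecs,
    # d_mix[1] against flat_tokvecs, and the gradient flows to both inputs.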
|  | @ -3,7 +3,7 @@ | |||
| # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py | ||||
| 
 | ||||
| __title__ = 'spacy-nightly' | ||||
| __version__ = '2.0.0a9' | ||||
| __version__ = '2.0.0a10' | ||||
| __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' | ||||
| __uri__ = 'https://spacy.io' | ||||
| __author__ = 'Explosion AI' | ||||
|  |  | |||
|  | @ -21,10 +21,10 @@ CONVERTERS = { | |||
| @plac.annotations( | ||||
|     input_file=("input file", "positional", None, str), | ||||
|     output_dir=("output directory for converted file", "positional", None, str), | ||||
|     n_sents=("Number of sentences per doc", "option", "n", float), | ||||
|     n_sents=("Number of sentences per doc", "option", "n", int), | ||||
|     morphology=("Enable appending morphology to tags", "flag", "m", bool) | ||||
| ) | ||||
| def convert(cmd, input_file, output_dir, n_sents, morphology): | ||||
| def convert(cmd, input_file, output_dir, n_sents=1, morphology=False): | ||||
|     """ | ||||
|     Convert files into JSON format for use with train command and other | ||||
|     experiment management functions. | ||||
|  |  | |||
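The convert change above gives n_sents and morphology real defaults and makes n_sents an int rather than a float. An illustrative direct call of the plac-dispatched function (paths are made up; the CLI normally invokes this for you):

    from spacy.cli import convert

    convert(None, 'train.conllu', '/tmp/out', n_sents=10, morphology=False)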
|  | @ -91,15 +91,14 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|                 for batch in minibatch(train_docs, size=batch_sizes): | ||||
|                     docs, golds = zip(*batch) | ||||
|                     nlp.update(docs, golds, sgd=optimizer, | ||||
|                                drop=next(dropout_rates), losses=losses) | ||||
|                                drop=next(dropout_rates), losses=losses, | ||||
|                                update_tensors=True) | ||||
|                     pbar.update(sum(len(doc) for doc in docs)) | ||||
| 
 | ||||
|             with nlp.use_params(optimizer.averages): | ||||
|                 util.set_env_log(False) | ||||
|                 epoch_model_path = output_path / ('model%d' % i) | ||||
|                 nlp.to_disk(epoch_model_path) | ||||
|                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_: | ||||
|                     dill.dump(nlp, file_, -1) | ||||
|                 nlp_loaded = lang_class(pipeline=pipeline) | ||||
|                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path) | ||||
|                 scorer = nlp_loaded.evaluate( | ||||
|  |  | |||
|  | @ -46,19 +46,21 @@ is_osx = sys.platform == 'darwin' | |||
| 
 | ||||
| 
 | ||||
| if is_python2: | ||||
|     import imp | ||||
|     bytes_ = str | ||||
|     unicode_ = unicode | ||||
|     basestring_ = basestring | ||||
|     input_ = raw_input | ||||
|     json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8') | ||||
|     json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False).decode('utf8') | ||||
|     path2str = lambda path: str(path).decode('utf8') | ||||
| 
 | ||||
| elif is_python3: | ||||
|     import importlib.util | ||||
|     bytes_ = bytes | ||||
|     unicode_ = str | ||||
|     basestring_ = str | ||||
|     input_ = input | ||||
|     json_dumps = lambda data: ujson.dumps(data, indent=2) | ||||
|     json_dumps = lambda data: ujson.dumps(data, indent=2, escape_forward_slashes=False) | ||||
|     path2str = lambda path: str(path) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -102,3 +104,12 @@ def normalize_string_keys(old): | |||
|     return new | ||||
| 
 | ||||
| 
 | ||||
| def import_file(name, loc): | ||||
|     loc = str(loc) | ||||
|     if is_python2: | ||||
|         return imp.load_source(name, loc) | ||||
|     else: | ||||
|         spec = importlib.util.spec_from_file_location(name, str(loc)) | ||||
|         module = importlib.util.module_from_spec(spec) | ||||
|         spec.loader.exec_module(module) | ||||
|         return module | ||||
|  |  | |||
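The new import_file helper loads a module from an arbitrary filesystem path on both Python 2 and 3. A usage sketch, assuming it is exposed from spacy.compat alongside the helpers above (the module name and path are made up):

    from spacy.compat import import_file

    mod = import_file('my_lang', '/tmp/my_lang.py')  # hypothetical plugin file
    print(mod.__name__)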
spacy/lang/da/examples.py (18 lines, new file)
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.da.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple overvejer at købe et britisk startup for 1 milliard dollar", | ||||
|     "Selvkørende biler flytter forsikringsansvaret over på producenterne", | ||||
|     "San Francisco overvejer at forbyde leverandørrobotter på fortov", | ||||
|     "London er en stor by i Storbritannien" | ||||
| ] | ||||
spacy/lang/de/examples.py (22 lines, new file)
							|  | @ -0,0 +1,22 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.de.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen", | ||||
|     "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz", | ||||
|     "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz", | ||||
|     "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion", | ||||
|     "San Francisco erwägt Verbot von Lieferrobotern", | ||||
|     "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller", | ||||
|     "Wo bist du?", | ||||
|     "Was ist die Hauptstadt von Deutschland?" | ||||
| ] | ||||
spacy/lang/en/examples.py (22 lines, new file)
							|  | @ -0,0 +1,22 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.en.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple is looking at buying U.K. startup for $1 billion", | ||||
|     "Autonomous cars shift insurance liability toward manufacturers", | ||||
|     "San Francisco considers banning sidewalk delivery robots", | ||||
|     "London is a big city in the United Kingdom.", | ||||
|     "Where are you?", | ||||
|     "Who is the president of France?", | ||||
|     "What is the capital of the United States?", | ||||
|     "When was Barack Obama born?" | ||||
| ] | ||||
spacy/lang/es/examples.py (22 lines, new file)
							|  | @ -0,0 +1,22 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.es.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares", | ||||
|     "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes", | ||||
|     "San Francisco analiza prohibir los robots delivery", | ||||
|     "Londres es una gran ciudad del Reino Unido", | ||||
|     "El gato come pescado", | ||||
|     "Veo al hombre con el telescopio", | ||||
|     "La araña come moscas", | ||||
|     "El pingüino incuba en su nido" | ||||
| ] | ||||
spacy/lang/fr/examples.py (26 lines, new file)
							|  | @ -0,0 +1,26 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.fr.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars", | ||||
|     "Les voitures autonomes voient leur assurances décalées vers les constructeurs", | ||||
|     "San Francisco envisage d'interdire les robots coursiers", | ||||
|     "Londres est une grande ville du Royaume-Uni", | ||||
|     "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe", | ||||
|     "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon", | ||||
|     "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule", | ||||
|     "Nouvelles attaques de Trump contre le maire de Londres", | ||||
|     "Où es-tu ?", | ||||
|     "Qui est le président de la France ?", | ||||
|     "Où est la capitale des Etats-Unis ?", | ||||
|     "Quand est né Barack Obama ?" | ||||
| ] | ||||
spacy/lang/he/examples.py (28 lines, new file)
							|  | @ -0,0 +1,28 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.he.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל', | ||||
|     'רה"מ הודיע כי יחרים טקס בחסותו', | ||||
|     'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100', | ||||
|     'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית', | ||||
|     'סע לשלום, המפתחות בפנים.', | ||||
|     'מלצר, פעמיים טורקי!', | ||||
|     'ואהבת לרעך כמוך.', | ||||
|     'היום נעשה משהו בלתי נשכח.', | ||||
|     'איפה הילד?', | ||||
|     'מיהו נשיא צרפת?', | ||||
|     'מהי בירת ארצות הברית?', | ||||
|     "איך קוראים בעברית לצ'ופצ'יק של הקומקום?", | ||||
|     'מה הייתה הדקה?', | ||||
|     'מי אומר שלום ראשון, זה שעולה או זה שיורד?' | ||||
| ] | ||||
spacy/lang/it/examples.py (18 lines, new file)
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.it.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari", | ||||
|     "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori", | ||||
|     "San Francisco prevede di bandire i robot di consegna porta a porta", | ||||
|     "Londra è una grande città del Regno Unito." | ||||
| ] | ||||
spacy/lang/nb/examples.py (18 lines, new file)
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.nb.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar", | ||||
|     "Selvkjørende biler flytter forsikringsansvaret over på produsentene ", | ||||
|     "San Francisco vurderer å forby robotbud på fortauene", | ||||
|     "London er en stor by i Storbritannia." | ||||
| ] | ||||
spacy/lang/pl/examples.py (20 lines, new file)
							|  | @ -0,0 +1,20 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.pl.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Poczuł przyjemną woń mocnej kawy.", | ||||
|     "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.", | ||||
|     "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.", | ||||
|     "Nowy abonament pod lupą Komisji Europejskiej", | ||||
|     "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?", | ||||
|     "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”." | ||||
| ] | ||||
spacy/lang/pt/examples.py (18 lines, new file)
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.pt.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares", | ||||
|     "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.", | ||||
|     "São Francisco considera banir os robôs de entrega que andam pelas calçadas", | ||||
|     "Londres é a maior cidade do Reino Unido" | ||||
| ] | ||||
spacy/lang/sv/examples.py (18 lines, new file)
							|  | @ -0,0 +1,18 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| 
 | ||||
| """ | ||||
| Example sentences to test spaCy and its language models. | ||||
| 
 | ||||
| >>> from spacy.lang.sv.examples import sentences | ||||
| >>> docs = nlp.pipe(sentences) | ||||
| """ | ||||
| 
 | ||||
| 
 | ||||
| sentences = [ | ||||
|     "Apple överväger att köpa brittisk startup för 1 miljard dollar.", | ||||
|     "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.", | ||||
|     "San Fransisco överväger förbud mot leveransrobotar på trottoarer.", | ||||
|     "London är en storstad i Storbritannien." | ||||
| ] | ||||
|  | @ -95,7 +95,7 @@ class BaseDefaults(object): | |||
|         meta = nlp.meta if nlp is not None else {} | ||||
|         # Resolve strings, like "cnn", "lstm", etc | ||||
|         pipeline = [] | ||||
|         for entry in cls.pipeline: | ||||
|         for entry in meta.get('pipeline', []): | ||||
|             if entry in disable or getattr(entry, 'name', entry) in disable: | ||||
|                 continue | ||||
|             factory = cls.Defaults.factories[entry] | ||||
|  | @ -277,7 +277,8 @@ class Language(object): | |||
|     def make_doc(self, text): | ||||
|         return self.tokenizer(text) | ||||
| 
 | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None, | ||||
|             update_tensors=False): | ||||
|         """Update the models in the pipeline. | ||||
| 
 | ||||
|         docs (iterable): A batch of `Doc` objects. | ||||
|  | @ -304,14 +305,17 @@ class Language(object): | |||
|             grads[key] = (W, dW) | ||||
|         pipes = list(self.pipeline[1:]) | ||||
|         random.shuffle(pipes) | ||||
|         tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) | ||||
|         all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] | ||||
|         for proc in pipes: | ||||
|             if not hasattr(proc, 'update'): | ||||
|                 continue | ||||
|             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) | ||||
|             d_tokvecses = proc.update((docs, tokvecses), golds, | ||||
|                                       drop=drop, sgd=get_grads, losses=losses) | ||||
|             if d_tokvecses is not None: | ||||
|                 bp_tokvecses(d_tokvecses, sgd=sgd) | ||||
|             if update_tensors and d_tokvecses is not None: | ||||
|                 for i, d_tv in enumerate(d_tokvecses): | ||||
|                     all_d_tokvecses[i] += d_tv | ||||
|         bp_tokvecses(all_d_tokvecses, sgd=sgd) | ||||
|         for key, (W, dW) in grads.items(): | ||||
|             sgd(W, dW, key=key) | ||||
|         # Clear the tensor variable, to free GPU memory. | ||||
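The update() change above runs tok2vec once, lets every component add its tensor gradient into a shared buffer, and backpropagates through tok2vec a single time instead of once per component. A toy numpy restatement of the accumulation (shapes and gradients are illustrative):

    import numpy as np

    tokvecses = [np.ones((3, 4)), np.ones((2, 4))]        # one array per doc
    all_d_tokvecses = [np.zeros_like(tv) for tv in tokvecses]
    # Pretend two pipeline components each returned per-doc gradients:
    for d_tokvecses in ([0.1 * tv for tv in tokvecses],
                        [0.2 * tv for tv in tokvecses]):
        for i, d_tv in enumerate(d_tokvecses):
            all_d_tokvecses[i] += d_tv                    # sum across components
    # A single bp_tokvecses(all_d_tokvecses, sgd=sgd) then propagates
    # the summed gradient through the shared tok2vec model.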
|  | @ -381,9 +385,18 @@ class Language(object): | |||
|         return optimizer | ||||
| 
 | ||||
|     def evaluate(self, docs_golds): | ||||
|         docs, golds = zip(*docs_golds) | ||||
|         scorer = Scorer() | ||||
|         for doc, gold in zip(self.pipe(docs, batch_size=32), golds): | ||||
|         docs, golds = zip(*docs_golds) | ||||
|         docs = list(docs) | ||||
|         golds = list(golds) | ||||
|         for pipe in self.pipeline: | ||||
|             if not hasattr(pipe, 'pipe'): | ||||
|                 for doc in docs: | ||||
|                     pipe(doc) | ||||
|             else: | ||||
|                 docs = list(pipe.pipe(docs)) | ||||
|         assert len(docs) == len(golds) | ||||
|         for doc, gold in zip(docs, golds): | ||||
|             scorer.score(doc, gold) | ||||
|             doc.tensor = None | ||||
|         return scorer | ||||
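evaluate() now materializes docs and golds up front and pushes the docs through each component explicitly, batching via .pipe() where the component supports it. A toy restatement of that dispatch (plain Python stand-ins, not Doc objects):

    class UpperPipe(object):
        def pipe(self, docs):
            return (doc.upper() for doc in docs)

    docs = ['a', 'b', 'c']                   # stand-ins for Doc objects
    for pipe in [UpperPipe(), lambda doc: doc]:
        if not hasattr(pipe, 'pipe'):
            for doc in docs:
                pipe(doc)                    # per-doc components apply in place
        else:
            docs = list(pipe.pipe(docs))     # batch-capable components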
|  | @ -417,11 +430,16 @@ class Language(object): | |||
|             except StopIteration: | ||||
|                 pass | ||||
| 
 | ||||
|     def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]): | ||||
|     def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, | ||||
|             disable=[]): | ||||
|         """Process texts as a stream, and yield `Doc` objects in order. Supports | ||||
|         GIL-free multi-threading. | ||||
| 
 | ||||
|         texts (iterator): A sequence of texts to process. | ||||
|         as_tuples (bool): | ||||
|             If set to True, inputs should be a sequence of | ||||
|             (text, context) tuples. Output will then be a sequence of | ||||
|             (doc, context) tuples. Defaults to False. | ||||
|         n_threads (int): The number of worker threads to use. If -1, OpenMP will | ||||
|             decide how many to use at run time. Default is 2. | ||||
|         batch_size (int): The number of texts to buffer. | ||||
|  | @ -433,7 +451,7 @@ class Language(object): | |||
|             >>>     for doc in nlp.pipe(texts, batch_size=50, n_threads=4): | ||||
|             >>>         assert doc.is_parsed | ||||
|         """ | ||||
|         if tuples: | ||||
|         if as_tuples: | ||||
|             text_context1, text_context2 = itertools.tee(texts) | ||||
|             texts = (tc[0] for tc in text_context1) | ||||
|             contexts = (tc[1] for tc in text_context2) | ||||
|  |  | |||
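The renamed as_tuples flag threads a per-text context value through the pipeline. A usage sketch (nlp is assumed to be any loaded pipeline):

    data = [('A sentence here.', {'id': 1}),
            ('Another sentence.', {'id': 2})]
    for doc, context in nlp.pipe(data, as_tuples=True):
        print(context['id'], doc.text)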
|  | @ -42,7 +42,7 @@ from .compat import json_dumps | |||
| 
 | ||||
| from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS | ||||
| from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats | ||||
| from ._ml import build_text_classifier | ||||
| from ._ml import build_text_classifier, build_tagger_model | ||||
| from .parts_of_speech import X | ||||
| 
 | ||||
| 
 | ||||
|  | @ -138,7 +138,7 @@ class TokenVectorEncoder(BaseThincComponent): | |||
|     name = 'tensorizer' | ||||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, width=128, embed_size=7500, **cfg): | ||||
|     def Model(cls, width=128, embed_size=4000, **cfg): | ||||
|         """Create a new statistical model for the class. | ||||
| 
 | ||||
|         width (int): Output size of the model. | ||||
|  | @ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent): | |||
|         self.cfg = dict(cfg) | ||||
| 
 | ||||
|     def __call__(self, doc): | ||||
|         tags = self.predict([doc.tensor]) | ||||
|         tags = self.predict(([doc], [doc.tensor])) | ||||
|         self.set_annotations([doc], tags) | ||||
|         return doc | ||||
| 
 | ||||
|     def pipe(self, stream, batch_size=128, n_threads=-1): | ||||
|         for docs in cytoolz.partition_all(batch_size, stream): | ||||
|             docs = list(docs) | ||||
|             tokvecs = [d.tensor for d in docs] | ||||
|             tag_ids = self.predict(tokvecs) | ||||
|             tag_ids = self.predict((docs, tokvecs)) | ||||
|             self.set_annotations(docs, tag_ids) | ||||
|             yield from docs | ||||
| 
 | ||||
|     def predict(self, tokvecs): | ||||
|         scores = self.model(tokvecs) | ||||
|     def predict(self, docs_tokvecs): | ||||
|         scores = self.model(docs_tokvecs) | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         guesses = scores.argmax(axis=1) | ||||
|         if not isinstance(guesses, numpy.ndarray): | ||||
|             guesses = guesses.get() | ||||
|         tokvecs = docs_tokvecs[1] | ||||
|         guesses = self.model.ops.unflatten(guesses, | ||||
|                     [tv.shape[0] for tv in tokvecs]) | ||||
|         return guesses | ||||
|  | @ -282,6 +284,8 @@ class NeuralTagger(BaseThincComponent): | |||
|         cdef Vocab vocab = self.vocab | ||||
|         for i, doc in enumerate(docs): | ||||
|             doc_tag_ids = batch_tag_ids[i] | ||||
|             if hasattr(doc_tag_ids, 'get'): | ||||
|                 doc_tag_ids = doc_tag_ids.get() | ||||
|             for j, tag_id in enumerate(doc_tag_ids): | ||||
|                 # Don't clobber preset POS tags | ||||
|                 if doc.c[j].tag == 0 and doc.c[j].pos == 0: | ||||
|  | @ -294,8 +298,7 @@ class NeuralTagger(BaseThincComponent): | |||
| 
 | ||||
|         if self.model.nI is None: | ||||
|             self.model.nI = tokvecs[0].shape[1] | ||||
| 
 | ||||
|         tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) | ||||
|         tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop) | ||||
|         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) | ||||
| 
 | ||||
|         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) | ||||
|  | @ -346,10 +349,8 @@ class NeuralTagger(BaseThincComponent): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, token_vector_width): | ||||
|         return with_flatten( | ||||
|             chain(Maxout(token_vector_width, token_vector_width), | ||||
|                   Softmax(n_tags, token_vector_width))) | ||||
| 
 | ||||
|         return build_tagger_model(n_tags, token_vector_width) | ||||
|   | ||||
|     def use_params(self, params): | ||||
|         with self.model.use_params(params): | ||||
|             yield | ||||
|  | @ -432,7 +433,7 @@ class NeuralLabeller(NeuralTagger): | |||
| 
 | ||||
|     @property | ||||
|     def labels(self): | ||||
|         return self.cfg.get('labels', {}) | ||||
|         return self.cfg.setdefault('labels', {}) | ||||
| 
 | ||||
|     @labels.setter | ||||
|     def labels(self, value): | ||||
|  | @ -455,10 +456,8 @@ class NeuralLabeller(NeuralTagger): | |||
| 
 | ||||
|     @classmethod | ||||
|     def Model(cls, n_tags, token_vector_width): | ||||
|         return with_flatten( | ||||
|             chain(Maxout(token_vector_width, token_vector_width), | ||||
|                   Softmax(n_tags, token_vector_width))) | ||||
| 
 | ||||
|         return build_tagger_model(n_tags, token_vector_width) | ||||
|      | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         scores = self.model.ops.flatten(scores) | ||||
|         cdef int idx = 0 | ||||
|  |  | |||
|  | @ -215,7 +215,10 @@ cdef class StringStore: | |||
|         path = util.ensure_path(path) | ||||
|         with path.open('r') as file_: | ||||
|             strings = ujson.load(file_) | ||||
|         prev = list(self) | ||||
|         self._reset_and_load(strings) | ||||
|         for word in prev: | ||||
|             self.add(word) | ||||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|  | @ -234,7 +237,10 @@ cdef class StringStore: | |||
|         RETURNS (StringStore): The `StringStore` object. | ||||
|         """ | ||||
|         strings = ujson.loads(bytes_data) | ||||
|         prev = list(self) | ||||
|         self._reset_and_load(strings) | ||||
|         for word in prev: | ||||
|             self.add(word) | ||||
|         return self | ||||
| 
 | ||||
|     def set_frozen(self, bint is_frozen): | ||||
|  |  | |||
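The strings.pyx change makes deserialization additive: the store's current contents are saved before _reset_and_load and re-added afterwards, so loading another store's bytes no longer drops locally interned strings. A sketch of the behaviour (the strings are illustrative):

    from spacy.strings import StringStore

    local = StringStore()
    local.add(u'locally-added')
    payload = StringStore([u'from-disk']).to_bytes()
    local.from_bytes(payload)
    assert u'from-disk' in local and u'locally-added' in local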
spacy/syntax/_beam_utils.pyx (286 lines, new file)
							|  | @ -0,0 +1,286 @@ | |||
| # cython: infer_types=True | ||||
| # cython: profile=True | ||||
| cimport numpy as np | ||||
| import numpy | ||||
| from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF | ||||
| from thinc.extra.search cimport Beam | ||||
| from thinc.extra.search import MaxViolation | ||||
| from thinc.typedefs cimport hash_t, class_t | ||||
| from thinc.extra.search cimport MaxViolation | ||||
| 
 | ||||
| from .transition_system cimport TransitionSystem, Transition | ||||
| from .stateclass cimport StateClass | ||||
| from ..gold cimport GoldParse | ||||
| from ..tokens.doc cimport Doc | ||||
| 
 | ||||
| 
 | ||||
| # These are passed as callbacks to thinc.search.Beam | ||||
| cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1: | ||||
|     dest = <StateClass>_dest | ||||
|     src = <StateClass>_src | ||||
|     moves = <const Transition*>_moves | ||||
|     dest.clone(src) | ||||
|     moves[clas].do(dest.c, moves[clas].label) | ||||
| 
 | ||||
| 
 | ||||
| cdef int _check_final_state(void* _state, void* extra_args) except -1: | ||||
|     return (<StateClass>_state).is_final() | ||||
| 
 | ||||
| 
 | ||||
| def _cleanup(Beam beam): | ||||
|     for i in range(beam.width): | ||||
|         Py_XDECREF(<PyObject*>beam._states[i].content) | ||||
|         Py_XDECREF(<PyObject*>beam._parents[i].content) | ||||
| 
 | ||||
| 
 | ||||
| cdef hash_t _hash_state(void* _state, void* _) except 0: | ||||
|     state = <StateClass>_state | ||||
|     if state.c.is_final(): | ||||
|         return 1 | ||||
|     else: | ||||
|         return state.c.hash() | ||||
| 
 | ||||
| 
 | ||||
| cdef class ParserBeam(object): | ||||
|     cdef public TransitionSystem moves | ||||
|     cdef public object states | ||||
|     cdef public object golds | ||||
|     cdef public object beams | ||||
|     cdef public object dones | ||||
| 
 | ||||
|     def __init__(self, TransitionSystem moves, states, golds, | ||||
|             int width, float density): | ||||
|         self.moves = moves | ||||
|         self.states = states | ||||
|         self.golds = golds | ||||
|         self.beams = [] | ||||
|         cdef Beam beam | ||||
|         cdef StateClass state, st | ||||
|         for state in states: | ||||
|             beam = Beam(self.moves.n_moves, width, density) | ||||
|             beam.initialize(self.moves.init_beam_state, state.c.length, state.c._sent) | ||||
|             for i in range(beam.width): | ||||
|                 st = <StateClass>beam.at(i) | ||||
|                 st.c.offset = state.c.offset | ||||
|             self.beams.append(beam) | ||||
|         self.dones = [False] * len(self.beams) | ||||
| 
 | ||||
|     def __dealloc__(self): | ||||
|         if self.beams is not None: | ||||
|             for beam in self.beams: | ||||
|                 if beam is not None: | ||||
|                     _cleanup(beam) | ||||
| 
 | ||||
|     @property | ||||
|     def is_done(self): | ||||
|         return all(b.is_done or self.dones[i] for i, b in enumerate(self.beams)) | ||||
| 
 | ||||
|     def __getitem__(self, i): | ||||
|         return self.beams[i] | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         return len(self.beams) | ||||
| 
 | ||||
|     def advance(self, scores, follow_gold=False): | ||||
|         cdef Beam beam | ||||
|         for i, beam in enumerate(self.beams): | ||||
|             if beam.is_done or not scores[i].size or self.dones[i]: | ||||
|                 continue | ||||
|             self._set_scores(beam, scores[i]) | ||||
|             if self.golds is not None: | ||||
|                 self._set_costs(beam, self.golds[i], follow_gold=follow_gold) | ||||
|             if follow_gold: | ||||
|                 beam.advance(_transition_state, NULL, <void*>self.moves.c) | ||||
|             else: | ||||
|                 beam.advance(_transition_state, _hash_state, <void*>self.moves.c) | ||||
|             beam.check_done(_check_final_state, NULL) | ||||
|             if beam.is_done and self.golds is not None: | ||||
|                 for j in range(beam.size): | ||||
|                     state = <StateClass>beam.at(j) | ||||
|                     if state.is_final(): | ||||
|                         try: | ||||
|                             if self.moves.is_gold_parse(state, self.golds[i]): | ||||
|                                 beam._states[j].loss = 0.0 | ||||
|                             elif beam._states[j].loss == 0.0: | ||||
|                                 beam._states[j].loss = 1.0 | ||||
|                         except NotImplementedError: | ||||
|                             break | ||||
| 
 | ||||
|     def _set_scores(self, Beam beam, float[:, ::1] scores): | ||||
|         cdef float* c_scores = &scores[0, 0] | ||||
|         cdef int nr_state = min(scores.shape[0], beam.size) | ||||
|         cdef int nr_class = scores.shape[1] | ||||
|         for i in range(nr_state): | ||||
|             state = <StateClass>beam.at(i) | ||||
|             if not state.is_final(): | ||||
|                 for j in range(nr_class): | ||||
|                     beam.scores[i][j] = c_scores[i * nr_class + j] | ||||
|                 self.moves.set_valid(beam.is_valid[i], state.c) | ||||
|             else: | ||||
|                 for j in range(beam.nr_class): | ||||
|                     beam.scores[i][j] = 0 | ||||
|                     beam.costs[i][j] = 0 | ||||
| 
 | ||||
|     def _set_costs(self, Beam beam, GoldParse gold, int follow_gold=False): | ||||
|         for i in range(beam.size): | ||||
|             state = <StateClass>beam.at(i) | ||||
|             if not state.c.is_final(): | ||||
|                 self.moves.set_costs(beam.is_valid[i], beam.costs[i], state, gold) | ||||
|                 if follow_gold: | ||||
|                     for j in range(beam.nr_class): | ||||
|                         if beam.costs[i][j] >= 1: | ||||
|                             beam.is_valid[i][j] = 0 | ||||
| 
 | ||||
| 
 | ||||
| def get_token_ids(states, int n_tokens): | ||||
|     cdef StateClass state | ||||
|     cdef np.ndarray ids = numpy.zeros((len(states), n_tokens), | ||||
|                                       dtype='int32', order='C') | ||||
|     c_ids = <int*>ids.data | ||||
|     for i, state in enumerate(states): | ||||
|         if not state.is_final(): | ||||
|             state.c.set_context_tokens(c_ids, n_tokens) | ||||
|         else: | ||||
|             ids[i] = -1 | ||||
|         c_ids += ids.shape[1] | ||||
|     return ids | ||||
| 
 | ||||
| nr_update = 0 | ||||
| def update_beam(TransitionSystem moves, int nr_feature, int max_steps, | ||||
|                 states, tokvecs, golds, | ||||
|                 state2vec, vec2scores,  | ||||
|                 int width, float density, | ||||
|                 sgd=None, losses=None, drop=0.): | ||||
|     global nr_update | ||||
|     cdef MaxViolation violn | ||||
|     nr_update += 1 | ||||
|     pbeam = ParserBeam(moves, states, golds, | ||||
|                        width=width, density=density) | ||||
|     gbeam = ParserBeam(moves, states, golds, | ||||
|                        width=width, density=0.0) | ||||
|     cdef StateClass state | ||||
|     beam_maps = [] | ||||
|     backprops = [] | ||||
|     violns = [MaxViolation() for _ in range(len(states))] | ||||
|     for t in range(max_steps): | ||||
|         if pbeam.is_done and gbeam.is_done: | ||||
|             break | ||||
|         # The beam maps let us find the right row in the flattened scores | ||||
|         # arrays for each state. States are identified by (example id, history). | ||||
|         # We keep a different beam map for each step (since we'll have a flat | ||||
|         # scores array for each step). The beam map will let us take the per-state | ||||
|         # losses, and compute the gradient for each (step, state, class). | ||||
|         beam_maps.append({}) | ||||
|         # Gather all states from the two beams in a list. Some states may occur | ||||
|         # in both beams. To figure out which beam each state belonged to, | ||||
|         # we keep two lists of indices, p_indices and g_indices | ||||
|         states, p_indices, g_indices = get_states(pbeam, gbeam, beam_maps[-1], nr_update) | ||||
|         if not states: | ||||
|             break | ||||
|         # Now that we have our flat list of states, feed them through the model | ||||
|         token_ids = get_token_ids(states, nr_feature) | ||||
|         vectors, bp_vectors = state2vec.begin_update(token_ids, drop=drop) | ||||
|         scores, bp_scores = vec2scores.begin_update(vectors, drop=drop) | ||||
| 
 | ||||
|         # Store the callbacks for the backward pass | ||||
|         backprops.append((token_ids, bp_vectors, bp_scores)) | ||||
| 
 | ||||
|         # Unpack the flat scores into lists for the two beams. The indices arrays | ||||
|         # tell us which example and state the scores-row refers to. | ||||
|         p_scores = [numpy.ascontiguousarray(scores[indices], dtype='f') for indices in p_indices] | ||||
|         g_scores = [numpy.ascontiguousarray(scores[indices], dtype='f')  for indices in g_indices] | ||||
|         # Now advance the states in the beams. The gold beam is constrained | ||||
|         # to follow only gold analyses. | ||||
|         pbeam.advance(p_scores) | ||||
|         gbeam.advance(g_scores, follow_gold=True) | ||||
|         # Track the "maximum violation", to use in the update. | ||||
|         for i, violn in enumerate(violns): | ||||
|             violn.check_crf(pbeam[i], gbeam[i]) | ||||
|     histories = [] | ||||
|     losses = [] | ||||
|     for violn in violns: | ||||
|         if violn.p_hist: | ||||
|             histories.append(violn.p_hist + violn.g_hist) | ||||
|             losses.append(violn.p_probs + violn.g_probs) | ||||
|         else: | ||||
|             histories.append([]) | ||||
|             losses.append([]) | ||||
|     states_d_scores = get_gradient(moves.n_moves, beam_maps, histories, losses) | ||||
|     return states_d_scores, backprops[:len(states_d_scores)] | ||||
| 
 | ||||
| 
 | ||||
| def get_states(pbeams, gbeams, beam_map, nr_update): | ||||
|     seen = {} | ||||
|     states = [] | ||||
|     p_indices = [] | ||||
|     g_indices = [] | ||||
|     cdef Beam pbeam, gbeam | ||||
|     assert len(pbeams) == len(gbeams) | ||||
|     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): | ||||
|         p_indices.append([]) | ||||
|         g_indices.append([]) | ||||
|         for i in range(pbeam.size): | ||||
|             state = <StateClass>pbeam.at(i) | ||||
|             if not state.is_final(): | ||||
|                 key = tuple([eg_id] + pbeam.histories[i]) | ||||
|                 assert key not in seen, (key, seen) | ||||
|                 seen[key] = len(states) | ||||
|                 p_indices[-1].append(len(states)) | ||||
|                 states.append(state) | ||||
|         beam_map.update(seen) | ||||
|         for i in range(gbeam.size): | ||||
|             state = <StateClass>gbeam.at(i) | ||||
|             if not state.is_final(): | ||||
|                 key = tuple([eg_id] + gbeam.histories[i]) | ||||
|                 if key in seen: | ||||
|                     g_indices[-1].append(seen[key]) | ||||
|                 else: | ||||
|                     g_indices[-1].append(len(states)) | ||||
|                     beam_map[key] = len(states) | ||||
|                     states.append(state) | ||||
|     p_idx = [numpy.asarray(idx, dtype='i') for idx in p_indices] | ||||
|     g_idx = [numpy.asarray(idx, dtype='i') for idx in g_indices] | ||||
|     return states, p_idx, g_idx | ||||
| 
 | ||||
| 
 | ||||
| def get_gradient(nr_class, beam_maps, histories, losses): | ||||
|     """ | ||||
|     The global model assigns a loss to each parse. The beam scores | ||||
|     are additive, so the same gradient is applied to each action | ||||
|     in the history. This gives the gradient of a single *action* | ||||
|     for a beam state -- so we have "the gradient of loss for taking | ||||
|     action i given history H." | ||||
| 
 | ||||
|     Histories: each history is a list of actions; | ||||
|     each candidate has a history; | ||||
|     each beam has multiple candidates; | ||||
|     each batch has multiple beams. | ||||
|     So histories is a list of lists of lists of ints. | ||||
|     """ | ||||
|     grads = [] | ||||
|     # nr_step is the length of the longest history that carries a non-zero loss. | ||||
|     nr_step = 0 | ||||
|     for eg_id, hists in enumerate(histories): | ||||
|         for loss, hist in zip(losses[eg_id], hists): | ||||
|             if loss != 0.0 and not numpy.isnan(loss): | ||||
|                 nr_step = max(nr_step, len(hist)) | ||||
|     for i in range(nr_step): | ||||
|         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), dtype='f')) | ||||
|     assert len(histories) == len(losses) | ||||
|     for eg_id, hists in enumerate(histories): | ||||
|         for loss, hist in zip(losses[eg_id], hists): | ||||
|             if loss == 0.0 or numpy.isnan(loss): | ||||
|                 continue | ||||
|             key = tuple([eg_id]) | ||||
|             # Adjust loss for length | ||||
|             avg_loss = loss / len(hist) | ||||
|             loss += avg_loss * (nr_step - len(hist)) | ||||
|             for j, clas in enumerate(hist): | ||||
|                 i = beam_maps[j][key] | ||||
|                 # In step j, at state i action clas | ||||
|                 # resulted in loss | ||||
|                 grads[j][i, clas] += loss | ||||
|                 key = key + tuple([clas]) | ||||
|     return grads | ||||
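| # A minimal sketch of the bookkeeping above (hypothetical toy values): one | ||||
| # example whose single candidate took actions [0, 1] with loss 0.5. | ||||
| # | ||||
| #     toy_beam_maps = [{(0,): 0}, {(0, 0): 0}] | ||||
| #     toy_grads = get_gradient(2, toy_beam_maps, [[[0, 1]]], [[0.5]]) | ||||
| #     assert len(toy_grads) == 2         # one gradient matrix per step | ||||
| #     assert toy_grads[0][0, 0] == 0.5   # step 0, state 0, action 0 | ||||
| #     assert toy_grads[1][0, 1] == 0.5   # step 1, state 0, action 1 | ||||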
| 
 | ||||
| 
 | ||||
|  | @ -37,6 +37,7 @@ cdef cppclass StateC: | |||
|         this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint)) | ||||
|         this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC)) | ||||
|         this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity)) | ||||
|         this.offset = 0 | ||||
|         cdef int i | ||||
|         for i in range(length + (PADDING * 2)): | ||||
|             this._ents[i].end = -1 | ||||
|  | @ -73,7 +74,16 @@ cdef cppclass StateC: | |||
|         free(this.shifted - PADDING) | ||||
| 
 | ||||
|     void set_context_tokens(int* ids, int n) nogil: | ||||
|         if n == 13: | ||||
|         if n == 8: | ||||
|             ids[0] = this.B(0) | ||||
|             ids[1] = this.B(1) | ||||
|             ids[2] = this.S(0) | ||||
|             ids[3] = this.S(1) | ||||
|             ids[4] = this.H(this.S(0)) | ||||
|             ids[5] = this.L(this.B(0), 1) | ||||
|             ids[6] = this.L(this.S(0), 2) | ||||
|             ids[7] = this.R(this.S(0), 1) | ||||
|         elif n == 13: | ||||
|             ids[0] = this.B(0) | ||||
|             ids[1] = this.B(1) | ||||
|             ids[2] = this.S(0) | ||||
|  |  | |||
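| # For reference, a hypothetical Python mirror of the new 8-token context | ||||
| # template in set_context_tokens above (assumes a StateClass-like object | ||||
| # with B/S/H/L/R accessors): | ||||
| # | ||||
| #     def context_tokens_8(state): | ||||
| #         return [state.B(0), state.B(1),    # first two buffer words | ||||
| #                 state.S(0), state.S(1),    # top two stack words | ||||
| #                 state.H(state.S(0)),       # head of S0 | ||||
| #                 state.L(state.B(0), 1),    # leftmost child of B0 | ||||
| #                 state.L(state.S(0), 2),    # second-leftmost child of S0 | ||||
| #                 state.R(state.S(0), 1)]    # rightmost child of S0 | ||||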
|  | @ -351,6 +351,20 @@ cdef class ArcEager(TransitionSystem): | |||
|         def __get__(self): | ||||
|             return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) | ||||
| 
 | ||||
|     def is_gold_parse(self, StateClass state, GoldParse gold): | ||||
|         predicted = set() | ||||
|         truth = set() | ||||
|         for i in range(gold.length): | ||||
|             if gold.cand_to_gold[i] is None: | ||||
|                 continue | ||||
|             if state.safe_get(i).dep: | ||||
|                 predicted.add((i, state.H(i), self.strings[state.safe_get(i).dep])) | ||||
|             else: | ||||
|                 predicted.add((i, state.H(i), 'ROOT')) | ||||
|             id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] | ||||
|             truth.add((id_, head, dep)) | ||||
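|         # For illustration, with hypothetical arcs as (child, head, label) | ||||
|         # triples, both sets might equal | ||||
|         # {(0, 1, 'nsubj'), (1, 1, 'ROOT'), (2, 1, 'dobj')}; the parse is | ||||
|         # gold iff the two arc sets match exactly. | ||||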
|         return truth == predicted | ||||
| 
 | ||||
|     def has_gold(self, GoldParse gold, start=0, end=None): | ||||
|         end = end or len(gold.heads) | ||||
|         if all([tag is None for tag in gold.heads[start:end]]): | ||||
|  | @ -385,6 +399,7 @@ cdef class ArcEager(TransitionSystem): | |||
|         for i in range(self.n_moves): | ||||
|             if self.c[i].move == move and self.c[i].label == label: | ||||
|                 return self.c[i] | ||||
|         return Transition(clas=0, move=MISSING, label=0) | ||||
| 
 | ||||
|     def move_name(self, int move, attr_t label): | ||||
|         label_str = self.strings[label] | ||||
|  |  | |||
|  | @ -107,7 +107,7 @@ cdef class BeamParser(Parser): | |||
|             # The non-monotonic oracle makes it difficult to ensure final costs are | ||||
|             # correct. Therefore do final correction | ||||
|             for i in range(pred.size): | ||||
|                 if is_gold(<StateClass>pred.at(i), gold_parse, self.moves.strings): | ||||
|                 if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse): | ||||
|                     pred._states[i].loss = 0.0 | ||||
|                 elif pred._states[i].loss == 0.0: | ||||
|                     pred._states[i].loss = 1.0 | ||||
|  | @ -213,7 +213,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio | |||
|         if not pred._states[i].is_done or pred._states[i].loss == 0: | ||||
|             continue | ||||
|         state = <StateClass>pred.at(i) | ||||
|         if is_gold(state, gold_parse, moves.strings) == True: | ||||
|         if moves.is_gold_parse(state, gold_parse): | ||||
|             for dep in gold_parse.orig_annot: | ||||
|                 print(dep[1], dep[3], dep[4]) | ||||
|             print("Cost", pred._states[i].loss) | ||||
|  | @ -227,7 +227,7 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio | |||
|         if not gold._states[i].is_done: | ||||
|             continue | ||||
|         state = <StateClass>gold.at(i) | ||||
|         if is_gold(state, gold_parse, moves.strings) == False: | ||||
|         if not moves.is_gold_parse(state, gold_parse): | ||||
|             print("Truth") | ||||
|             for dep in gold_parse.orig_annot: | ||||
|                 print(dep[1], dep[3], dep[4]) | ||||
|  | @ -237,16 +237,3 @@ def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, Transitio | |||
|             raise Exception("Gold parse is not gold-standard") | ||||
| 
 | ||||
| 
 | ||||
| def is_gold(StateClass state, GoldParse gold, StringStore strings): | ||||
|     predicted = set() | ||||
|     truth = set() | ||||
|     for i in range(gold.length): | ||||
|         if gold.cand_to_gold[i] is None: | ||||
|             continue | ||||
|         if state.safe_get(i).dep: | ||||
|             predicted.add((i, state.H(i), strings[state.safe_get(i).dep])) | ||||
|         else: | ||||
|             predicted.add((i, state.H(i), 'ROOT')) | ||||
|         id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] | ||||
|         truth.add((id_, head, dep)) | ||||
|     return truth == predicted | ||||
|  |  | |||
|  | @ -14,8 +14,4 @@ cdef class Parser: | |||
|     cdef readonly TransitionSystem moves | ||||
|     cdef readonly object cfg | ||||
| 
 | ||||
|     cdef void _parse_step(self, StateC* state, | ||||
|             const float* feat_weights, | ||||
|             int nr_class, int nr_feat, int nr_piece) nogil | ||||
| 
 | ||||
|     #cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil | ||||
|  |  | |||
|  | @ -37,14 +37,18 @@ from preshed.maps cimport MapStruct | |||
| from preshed.maps cimport map_get | ||||
| 
 | ||||
| from thinc.api import layerize, chain, noop, clone | ||||
| from thinc.neural import Model, Affine, ELU, ReLu, Maxout | ||||
| from thinc.neural import Model, Affine, ReLu, Maxout | ||||
| from thinc.neural._classes.batchnorm import BatchNorm as BN | ||||
| from thinc.neural._classes.selu import SELU | ||||
| from thinc.neural._classes.layernorm import LayerNorm | ||||
| from thinc.neural.ops import NumpyOps, CupyOps | ||||
| from thinc.neural.util import get_array_module | ||||
| 
 | ||||
| from .. import util | ||||
| from ..util import get_async, get_cuda_stream | ||||
| from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts | ||||
| from .._ml import Tok2Vec, doc2feats, rebatch | ||||
| from .._ml import Tok2Vec, doc2feats, rebatch, fine_tune | ||||
| from .._ml import Residual, drop_layer | ||||
| from ..compat import json_dumps | ||||
| 
 | ||||
| from . import _parse_features | ||||
|  | @ -59,8 +63,10 @@ from ..structs cimport TokenC | |||
| from ..tokens.doc cimport Doc | ||||
| from ..strings cimport StringStore | ||||
| from ..gold cimport GoldParse | ||||
| from ..attrs cimport TAG, DEP | ||||
| from ..attrs cimport ID, TAG, DEP, ORTH, NORM, PREFIX, SUFFIX | ||||
| from . import _beam_utils | ||||
| 
 | ||||
| USE_FINE_TUNE = True | ||||
| 
 | ||||
| def get_templates(*args, **kwargs): | ||||
|     return [] | ||||
|  | @ -232,11 +238,14 @@ cdef class Parser: | |||
|     Base class of the DependencyParser and EntityRecognizer. | ||||
|     """ | ||||
|     @classmethod | ||||
|     def Model(cls, nr_class, token_vector_width=128, hidden_width=128, depth=1, **cfg): | ||||
|     def Model(cls, nr_class, token_vector_width=128, hidden_width=300, depth=1, **cfg): | ||||
|         depth = util.env_opt('parser_hidden_depth', depth) | ||||
|         token_vector_width = util.env_opt('token_vector_width', token_vector_width) | ||||
|         hidden_width = util.env_opt('hidden_width', hidden_width) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', 2) | ||||
|         embed_size = util.env_opt('embed_size', 4000) | ||||
|         tensors = fine_tune(Tok2Vec(token_vector_width, embed_size, | ||||
|                     preprocess=doc2feats())) | ||||
|         if parser_maxout_pieces == 1: | ||||
|             lower = PrecomputableAffine(hidden_width if depth >= 1 else nr_class, | ||||
|                         nF=cls.nr_feature, | ||||
|  | @ -248,15 +257,10 @@ cdef class Parser: | |||
|                         nI=token_vector_width) | ||||
| 
 | ||||
|         with Model.use_device('cpu'): | ||||
|             if depth == 0: | ||||
|                 upper = chain() | ||||
|                 upper.is_noop = True | ||||
|             else: | ||||
|                 upper = chain( | ||||
|                     clone(Maxout(hidden_width), (depth-1)), | ||||
|                     zero_init(Affine(nr_class, drop_factor=0.0)) | ||||
|                 ) | ||||
|                 upper.is_noop = False | ||||
|             upper = chain( | ||||
|                 clone(Maxout(hidden_width), (depth-1)), | ||||
|                 zero_init(Affine(nr_class, drop_factor=0.0)) | ||||
|             ) | ||||
|         # TODO: This is an unfortunate hack atm! | ||||
|         # Used to set input dimensions in network. | ||||
|         lower.begin_training(lower.ops.allocate((500, token_vector_width))) | ||||
|  | @ -268,7 +272,7 @@ cdef class Parser: | |||
|             'hidden_width': hidden_width, | ||||
|             'maxout_pieces': parser_maxout_pieces | ||||
|         } | ||||
|         return (lower, upper), cfg | ||||
|         return (tensors, lower, upper), cfg | ||||
| 
 | ||||
|     def __init__(self, Vocab vocab, moves=True, model=True, **cfg): | ||||
|         """ | ||||
|  | @ -294,6 +298,10 @@ cdef class Parser: | |||
|             self.moves = self.TransitionSystem(self.vocab.strings, {}) | ||||
|         else: | ||||
|             self.moves = moves | ||||
|         if 'beam_width' not in cfg: | ||||
|             cfg['beam_width'] = util.env_opt('beam_width', 1) | ||||
|         if 'beam_density' not in cfg: | ||||
|             cfg['beam_density'] = util.env_opt('beam_density', 0.0) | ||||
|         self.cfg = cfg | ||||
|         if 'actions' in self.cfg: | ||||
|             for action, labels in self.cfg.get('actions', {}).items(): | ||||
|  | @ -316,7 +324,7 @@ cdef class Parser: | |||
|         if beam_width is None: | ||||
|             beam_width = self.cfg.get('beam_width', 1) | ||||
|         if beam_density is None: | ||||
|             beam_density = self.cfg.get('beam_density', 0.001) | ||||
|             beam_density = self.cfg.get('beam_density', 0.0) | ||||
|         cdef Beam beam | ||||
|         if beam_width == 1: | ||||
|             states = self.parse_batch([doc], [doc.tensor]) | ||||
|  | @ -332,7 +340,7 @@ cdef class Parser: | |||
|             return output | ||||
| 
 | ||||
|     def pipe(self, docs, int batch_size=1000, int n_threads=2, | ||||
|              beam_width=1, beam_density=0.001): | ||||
|              beam_width=None, beam_density=None): | ||||
|         """ | ||||
|         Process a stream of documents. | ||||
| 
 | ||||
|  | @ -344,17 +352,23 @@ cdef class Parser: | |||
|                 The number of threads with which to work on the buffer in parallel. | ||||
|         Yields (Doc): Documents, in order. | ||||
|         """ | ||||
|         cdef StateClass parse_state | ||||
|         if beam_width is None: | ||||
|             beam_width = self.cfg.get('beam_width', 1) | ||||
|         if beam_density is None: | ||||
|             beam_density = self.cfg.get('beam_density', 0.0) | ||||
|         cdef Doc doc | ||||
|         queue = [] | ||||
|         cdef Beam beam | ||||
|         for docs in cytoolz.partition_all(batch_size, docs): | ||||
|             docs = list(docs) | ||||
|             tokvecs = [d.tensor for d in docs] | ||||
|             tokvecs = [doc.tensor for doc in docs] | ||||
|             if beam_width == 1: | ||||
|                 parse_states = self.parse_batch(docs, tokvecs) | ||||
|             else: | ||||
|                 parse_states = self.beam_parse(docs, tokvecs, | ||||
|                                     beam_width=beam_width, beam_density=beam_density) | ||||
|                 beams = self.beam_parse(docs, tokvecs, | ||||
|                             beam_width=beam_width, beam_density=beam_density) | ||||
|                 parse_states = [] | ||||
|                 for beam in beams: | ||||
|                     parse_states.append(<StateClass>beam.at(0)) | ||||
|             self.set_annotations(docs, parse_states) | ||||
|             yield from docs | ||||
| 
 | ||||
|  | @ -369,8 +383,12 @@ cdef class Parser: | |||
|             int nr_class, nr_feat, nr_piece, nr_dim, nr_state | ||||
|         if isinstance(docs, Doc): | ||||
|             docs = [docs] | ||||
|         if isinstance(tokvecses, np.ndarray): | ||||
|             tokvecses = [tokvecses] | ||||
| 
 | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) | ||||
|         if USE_FINE_TUNE: | ||||
|             tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) | ||||
| 
 | ||||
|         nr_state = len(docs) | ||||
|         nr_class = self.moves.n_moves | ||||
|  | @ -394,27 +412,20 @@ cdef class Parser: | |||
|         cdef np.ndarray scores | ||||
|         c_token_ids = <int*>token_ids.data | ||||
|         c_is_valid = <int*>is_valid.data | ||||
|         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False) | ||||
|         while not next_step.empty(): | ||||
|             if not has_hidden: | ||||
|                 for i in cython.parallel.prange( | ||||
|                         next_step.size(), num_threads=6, nogil=True): | ||||
|                     self._parse_step(next_step[i], | ||||
|                         feat_weights, nr_class, nr_feat, nr_piece) | ||||
|             else: | ||||
|                 for i in range(next_step.size()): | ||||
|                     st = next_step[i] | ||||
|                     st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||
|                     self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||
|                 vectors = state2vec(token_ids[:next_step.size()]) | ||||
|                 scores = vec2scores(vectors) | ||||
|                 c_scores = <float*>scores.data | ||||
|                 for i in range(next_step.size()): | ||||
|                     st = next_step[i] | ||||
|                     guess = arg_max_if_valid( | ||||
|                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||
|                     action = self.moves.c[guess] | ||||
|                     action.do(st, action.label) | ||||
|             for i in range(next_step.size()): | ||||
|                 st = next_step[i] | ||||
|                 st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat) | ||||
|                 self.moves.set_valid(&c_is_valid[i*nr_class], st) | ||||
|             vectors = state2vec(token_ids[:next_step.size()]) | ||||
|             scores = vec2scores(vectors) | ||||
|             c_scores = <float*>scores.data | ||||
|             for i in range(next_step.size()): | ||||
|                 st = next_step[i] | ||||
|                 guess = arg_max_if_valid( | ||||
|                     &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class) | ||||
|                 action = self.moves.c[guess] | ||||
|                 action.do(st, action.label) | ||||
|             this_step, next_step = next_step, this_step | ||||
|             next_step.clear() | ||||
|             for st in this_step: | ||||
|  | @ -422,18 +433,22 @@ cdef class Parser: | |||
|                     next_step.push_back(st) | ||||
|         return states | ||||
| 
 | ||||
|     def beam_parse(self, docs, tokvecses, int beam_width=8, float beam_density=0.001): | ||||
|     def beam_parse(self, docs, tokvecses, int beam_width=3, float beam_density=0.001): | ||||
|         cdef Beam beam | ||||
|         cdef np.ndarray scores | ||||
|         cdef Doc doc | ||||
|         cdef int nr_class = self.moves.n_moves | ||||
|         cdef StateClass stcls, output | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecses) | ||||
|         if USE_FINE_TUNE: | ||||
|             tokvecs += self.model[0].ops.flatten(self.model[0]((docs, tokvecses))) | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         state2vec, vec2scores = self.get_batch_model(len(docs), tokvecs, | ||||
|                                                      cuda_stream, 0.0) | ||||
|         beams = [] | ||||
|         cdef int offset = 0 | ||||
|         cdef int j = 0 | ||||
|         cdef int k | ||||
|         for doc in docs: | ||||
|             beam = Beam(nr_class, beam_width, min_density=beam_density) | ||||
|             beam.initialize(self.moves.init_beam_state, doc.length, doc.c) | ||||
|  | @ -446,44 +461,32 @@ cdef class Parser: | |||
|                 states = [] | ||||
|                 for i in range(beam.size): | ||||
|                     stcls = <StateClass>beam.at(i) | ||||
|                     states.append(stcls) | ||||
|                     # This way we avoid having to score finalized states | ||||
|                     # We do have to take care to keep indexes aligned, though | ||||
|                     if not stcls.is_final(): | ||||
|                         states.append(stcls) | ||||
|                 token_ids = self.get_token_ids(states) | ||||
|                 vectors = state2vec(token_ids) | ||||
|                 scores = vec2scores(vectors) | ||||
|                 j = 0 | ||||
|                 c_scores = <float*>scores.data | ||||
|                 for i in range(beam.size): | ||||
|                     stcls = <StateClass>beam.at(i) | ||||
|                     if not stcls.is_final(): | ||||
|                         self.moves.set_valid(beam.is_valid[i], stcls.c) | ||||
|                         for j in range(nr_class): | ||||
|                             beam.scores[i][j] = scores[i, j] | ||||
|                         for k in range(nr_class): | ||||
|                             beam.scores[i][k] = c_scores[j * scores.shape[1] + k] | ||||
|                         j += 1 | ||||
|                 beam.advance(_transition_state, _hash_state, <void*>self.moves.c) | ||||
|                 beam.check_done(_check_final_state, NULL) | ||||
|             beams.append(beam) | ||||
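|         # Sketch of the alignment trick above (hypothetical flags): scores | ||||
|         # holds one row per *unfinished* state, so a second counter j walks | ||||
|         # the score rows while i walks the beam slots: | ||||
|         # | ||||
|         #     finished = [False, True, False] | ||||
|         #     j = 0 | ||||
|         #     for i, done in enumerate(finished): | ||||
|         #         if not done: | ||||
|         #             row = j    # score row j belongs to beam slot i | ||||
|         #             j += 1 | ||||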
|         return beams | ||||
| 
 | ||||
|     cdef void _parse_step(self, StateC* state, | ||||
|             const float* feat_weights, | ||||
|             int nr_class, int nr_feat, int nr_piece) nogil: | ||||
|         '''This only works with no hidden layers -- fast but inaccurate''' | ||||
|         #for i in cython.parallel.prange(next_step.size(), num_threads=4, nogil=True): | ||||
|         #    self._parse_step(next_step[i], feat_weights, nr_class, nr_feat) | ||||
|         token_ids = <int*>calloc(nr_feat, sizeof(int)) | ||||
|         scores = <float*>calloc(nr_class * nr_piece, sizeof(float)) | ||||
|         is_valid = <int*>calloc(nr_class, sizeof(int)) | ||||
| 
 | ||||
|         state.set_context_tokens(token_ids, nr_feat) | ||||
|         sum_state_features(scores, | ||||
|             feat_weights, token_ids, 1, nr_feat, nr_class * nr_piece) | ||||
|         self.moves.set_valid(is_valid, state) | ||||
|         guess = arg_maxout_if_valid(scores, is_valid, nr_class, nr_piece) | ||||
|         action = self.moves.c[guess] | ||||
|         action.do(state, action.label) | ||||
| 
 | ||||
|         free(is_valid) | ||||
|         free(scores) | ||||
|         free(token_ids) | ||||
| 
 | ||||
|     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None): | ||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5: | ||||
|             return self.update_beam(docs_tokvecs, golds, | ||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||
|                     drop=drop, sgd=sgd, losses=losses) | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         docs, tokvec_lists = docs_tokvecs | ||||
|  | @ -491,6 +494,10 @@ cdef class Parser: | |||
|         if isinstance(docs, Doc) and isinstance(golds, GoldParse): | ||||
|             docs = [docs] | ||||
|             golds = [golds] | ||||
|         if USE_FINE_TUNE: | ||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) | ||||
|             my_tokvecs = self.model[0].ops.flatten(my_tokvecs) | ||||
|             tokvecs += my_tokvecs | ||||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
| 
 | ||||
|  | @ -517,13 +524,14 @@ cdef class Parser: | |||
|             scores, bp_scores = vec2scores.begin_update(vector, drop=drop) | ||||
| 
 | ||||
|             d_scores = self.get_batch_loss(states, golds, scores) | ||||
|             d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd) | ||||
|             d_scores /= len(docs) | ||||
|             d_vector = bp_scores(d_scores, sgd=sgd) | ||||
|             if drop != 0: | ||||
|                 d_vector *= mask | ||||
| 
 | ||||
|             if isinstance(self.model[0].ops, CupyOps) \ | ||||
|             and not isinstance(token_ids, state2vec.ops.xp.ndarray): | ||||
|                 # Move token_ids and d_vector to CPU, asynchronously | ||||
|                 # Move token_ids and d_vector to GPU, asynchronously | ||||
|                 backprops.append(( | ||||
|                     get_async(cuda_stream, token_ids), | ||||
|                     get_async(cuda_stream, d_vector), | ||||
|  | @ -540,7 +548,62 @@ cdef class Parser: | |||
|                 break | ||||
|         self._make_updates(d_tokvecs, | ||||
|             backprops, sgd, cuda_stream) | ||||
|         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) | ||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) | ||||
|         if USE_FINE_TUNE: | ||||
|             bp_my_tokvecs(d_tokvecs, sgd=sgd) | ||||
|         return d_tokvecs | ||||
| 
 | ||||
|     def update_beam(self, docs_tokvecs, golds, width=None, density=None, | ||||
|             drop=0., sgd=None, losses=None): | ||||
|         if width is None: | ||||
|             width = self.cfg.get('beam_width', 2) | ||||
|         if density is None: | ||||
|             density = self.cfg.get('beam_density', 0.0) | ||||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         docs, tokvecs = docs_tokvecs | ||||
|         lengths = [len(d) for d in docs] | ||||
|         assert min(lengths) >= 1 | ||||
|         tokvecs = self.model[0].ops.flatten(tokvecs) | ||||
|         if USE_FINE_TUNE: | ||||
|             my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop) | ||||
|             my_tokvecs = self.model[0].ops.flatten(my_tokvecs) | ||||
|             tokvecs += my_tokvecs | ||||
| 
 | ||||
|         states = self.moves.init_batch(docs) | ||||
|         for gold in golds: | ||||
|             self.moves.preprocess_gold(gold) | ||||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, 0.0) | ||||
| 
 | ||||
|         states_d_scores, backprops = _beam_utils.update_beam(self.moves, self.nr_feature, 500, | ||||
|                                         states, tokvecs, golds, | ||||
|                                         state2vec, vec2scores, | ||||
|                                         width, density, | ||||
|                                         sgd=sgd, drop=drop, losses=losses) | ||||
|         backprop_lower = [] | ||||
|         cdef float batch_size = len(docs) | ||||
|         for i, d_scores in enumerate(states_d_scores): | ||||
|             d_scores /= batch_size | ||||
|             if losses is not None: | ||||
|                 losses[self.name] += (d_scores**2).sum() | ||||
|             ids, bp_vectors, bp_scores = backprops[i] | ||||
|             d_vector = bp_scores(d_scores, sgd=sgd) | ||||
|             if isinstance(self.model[0].ops, CupyOps) \ | ||||
|             and not isinstance(ids, state2vec.ops.xp.ndarray): | ||||
|                 backprop_lower.append(( | ||||
|                     get_async(cuda_stream, ids), | ||||
|                     get_async(cuda_stream, d_vector), | ||||
|                     bp_vectors)) | ||||
|             else: | ||||
|                 backprop_lower.append((ids, d_vector, bp_vectors)) | ||||
|         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) | ||||
|         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream) | ||||
|         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths) | ||||
|         if USE_FINE_TUNE: | ||||
|             bp_my_tokvecs(d_tokvecs, sgd=sgd) | ||||
|         return d_tokvecs | ||||
| 
 | ||||
|     def _init_gold_batch(self, whole_docs, whole_golds): | ||||
|         """Make a square batch, of length equal to the shortest doc. A long | ||||
|  | @ -585,14 +648,10 @@ cdef class Parser: | |||
|         xp = get_array_module(d_tokvecs) | ||||
|         for ids, d_vector, bp_vector in backprops: | ||||
|             d_state_features = bp_vector(d_vector, sgd=sgd) | ||||
|             active_feats = ids * (ids >= 0) | ||||
|             active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1)) | ||||
|             if hasattr(xp, 'scatter_add'): | ||||
|                 xp.scatter_add(d_tokvecs, | ||||
|                     ids, d_state_features * active_feats) | ||||
|             else: | ||||
|                 xp.add.at(d_tokvecs, | ||||
|                     ids, d_state_features * active_feats) | ||||
|             mask = ids >= 0 | ||||
|             d_state_features *= mask.reshape(ids.shape + (1,)) | ||||
|             self.model[0].ops.scatter_add(d_tokvecs, ids * mask, | ||||
|                 d_state_features) | ||||
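|         # A toy numpy equivalent of the masked scatter-add above | ||||
|         # (hypothetical shapes; negative ids mark padding tokens whose | ||||
|         # gradient must be dropped): | ||||
|         # | ||||
|         #     d_tokvecs = numpy.zeros((4, 3), dtype='f') | ||||
|         #     ids = numpy.array([[0, 2, -1]]) | ||||
|         #     mask = ids >= 0 | ||||
|         #     d_feats = numpy.ones((1, 3, 3), dtype='f') * mask.reshape(ids.shape + (1,)) | ||||
|         #     numpy.add.at(d_tokvecs, ids * mask, d_feats) | ||||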
| 
 | ||||
|     @property | ||||
|     def move_names(self): | ||||
|  | @ -603,12 +662,12 @@ cdef class Parser: | |||
|         return names | ||||
| 
 | ||||
|     def get_batch_model(self, batch_size, tokvecs, stream, dropout): | ||||
|         lower, upper = self.model | ||||
|         _, lower, upper = self.model | ||||
|         state2vec = precompute_hiddens(batch_size, tokvecs, | ||||
|                         lower, stream, drop=dropout) | ||||
|         return state2vec, upper | ||||
| 
 | ||||
|     nr_feature = 13 | ||||
|     nr_feature = 8 | ||||
| 
 | ||||
|     def get_token_ids(self, states): | ||||
|         cdef StateClass state | ||||
|  | @ -693,10 +752,12 @@ cdef class Parser: | |||
| 
 | ||||
|     def to_disk(self, path, **exclude): | ||||
|         serializers = { | ||||
|             'lower_model': lambda p: p.open('wb').write( | ||||
|             'tok2vec_model': lambda p: p.open('wb').write( | ||||
|                 self.model[0].to_bytes()), | ||||
|             'upper_model': lambda p: p.open('wb').write( | ||||
|             'lower_model': lambda p: p.open('wb').write( | ||||
|                 self.model[1].to_bytes()), | ||||
|             'upper_model': lambda p: p.open('wb').write( | ||||
|                 self.model[2].to_bytes()), | ||||
|             'vocab': lambda p: self.vocab.to_disk(p), | ||||
|             'moves': lambda p: self.moves.to_disk(p, strings=False), | ||||
|             'cfg': lambda p: p.open('w').write(json_dumps(self.cfg)) | ||||
|  | @ -717,24 +778,29 @@ cdef class Parser: | |||
|                 self.model, cfg = self.Model(**self.cfg) | ||||
|             else: | ||||
|                 cfg = {} | ||||
|             with (path / 'lower_model').open('rb') as file_: | ||||
|             with (path / 'tok2vec_model').open('rb') as file_: | ||||
|                 bytes_data = file_.read() | ||||
|             self.model[0].from_bytes(bytes_data) | ||||
|             with (path / 'upper_model').open('rb') as file_: | ||||
|             with (path / 'lower_model').open('rb') as file_: | ||||
|                 bytes_data = file_.read() | ||||
|             self.model[1].from_bytes(bytes_data) | ||||
|             with (path / 'upper_model').open('rb') as file_: | ||||
|                 bytes_data = file_.read() | ||||
|             self.model[2].from_bytes(bytes_data) | ||||
|             self.cfg.update(cfg) | ||||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         serializers = OrderedDict(( | ||||
|             ('lower_model', lambda: self.model[0].to_bytes()), | ||||
|             ('upper_model', lambda: self.model[1].to_bytes()), | ||||
|             ('tok2vec_model', lambda: self.model[0].to_bytes()), | ||||
|             ('lower_model', lambda: self.model[1].to_bytes()), | ||||
|             ('upper_model', lambda: self.model[2].to_bytes()), | ||||
|             ('vocab', lambda: self.vocab.to_bytes()), | ||||
|             ('moves', lambda: self.moves.to_bytes(strings=False)), | ||||
|             ('cfg', lambda: ujson.dumps(self.cfg)) | ||||
|         )) | ||||
|         if 'model' in exclude: | ||||
|             exclude['tok2vec_model'] = True | ||||
|             exclude['lower_model'] = True | ||||
|             exclude['upper_model'] = True | ||||
|             exclude.pop('model') | ||||
|  | @ -745,6 +811,7 @@ cdef class Parser: | |||
|             ('vocab', lambda b: self.vocab.from_bytes(b)), | ||||
|             ('moves', lambda b: self.moves.from_bytes(b, strings=False)), | ||||
|             ('cfg', lambda b: self.cfg.update(ujson.loads(b))), | ||||
|             ('tok2vec_model', lambda b: None), | ||||
|             ('lower_model', lambda b: None), | ||||
|             ('upper_model', lambda b: None) | ||||
|         )) | ||||
|  | @ -754,10 +821,12 @@ cdef class Parser: | |||
|                 self.model, cfg = self.Model(self.moves.n_moves) | ||||
|             else: | ||||
|                 cfg = {} | ||||
|             if 'tok2vec_model' in msg: | ||||
|                 self.model[0].from_bytes(msg['tok2vec_model']) | ||||
|             if 'lower_model' in msg: | ||||
|                 self.model[0].from_bytes(msg['lower_model']) | ||||
|                 self.model[1].from_bytes(msg['lower_model']) | ||||
|             if 'upper_model' in msg: | ||||
|                 self.model[1].from_bytes(msg['upper_model']) | ||||
|                 self.model[2].from_bytes(msg['upper_model']) | ||||
|             self.cfg.update(cfg) | ||||
|         return self | ||||
| 
 | ||||
|  |  | |||
|  | @ -99,6 +99,9 @@ cdef class TransitionSystem: | |||
|     def preprocess_gold(self, GoldParse gold): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def is_gold_parse(self, StateClass state, GoldParse gold): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     cdef Transition lookup_transition(self, object name) except *: | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|  | @ -107,6 +110,8 @@ cdef class TransitionSystem: | |||
| 
 | ||||
|     def is_valid(self, StateClass stcls, move_name): | ||||
|         action = self.lookup_transition(move_name) | ||||
|         if action.move == 0: | ||||
|             return False | ||||
|         return action.is_valid(stcls.c, action.label) | ||||
| 
 | ||||
|     cdef int set_valid(self, int* is_valid, const StateC* st) nogil: | ||||
|  |  | |||
|  | @ -78,3 +78,16 @@ def test_predict_doc_beam(parser, tok2vec, model, doc): | |||
|     parser(doc, beam_width=32, beam_density=0.001) | ||||
|     for word in doc: | ||||
|         print(word.text, word.head, word.dep_) | ||||
| 
 | ||||
| 
 | ||||
| def test_update_doc_beam(parser, tok2vec, model, doc, gold): | ||||
|     parser.model = model | ||||
|     tokvecs, bp_tokvecs = tok2vec.begin_update([doc]) | ||||
|     d_tokvecs = parser.update_beam(([doc], tokvecs), [gold]) | ||||
|     assert d_tokvecs[0].shape == tokvecs[0].shape | ||||
|     def optimize(weights, gradient, key=None): | ||||
|         weights -= 0.001 * gradient | ||||
|     bp_tokvecs(d_tokvecs, sgd=optimize) | ||||
|     assert d_tokvecs[0].sum() == 0. | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
87 spacy/tests/parser/test_nn_beam.py Normal file
							|  | @ -0,0 +1,87 @@ | |||
| from __future__ import unicode_literals | ||||
| import pytest | ||||
| import numpy | ||||
| from thinc.api import layerize | ||||
| 
 | ||||
| from ...vocab import Vocab | ||||
| from ...syntax.arc_eager import ArcEager | ||||
| from ...tokens import Doc | ||||
| from ...gold import GoldParse | ||||
| from ...syntax._beam_utils import ParserBeam, update_beam | ||||
| from ...syntax.stateclass import StateClass | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def vocab(): | ||||
|     return Vocab() | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def moves(vocab): | ||||
|     aeager = ArcEager(vocab.strings, {}) | ||||
|     aeager.add_action(2, 'nsubj') | ||||
|     aeager.add_action(3, 'dobj') | ||||
|     aeager.add_action(2, 'aux') | ||||
|     return aeager | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def docs(vocab): | ||||
|     return [Doc(vocab, words=['Rats', 'bite', 'things'])] | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def states(docs): | ||||
|     return [StateClass(doc) for doc in docs] | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def tokvecs(docs, vector_size): | ||||
|     output = [] | ||||
|     for doc in docs: | ||||
|         vec = numpy.random.uniform(-0.1, 0.1, (len(doc), vector_size)) | ||||
|         output.append(numpy.asarray(vec)) | ||||
|     return output | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def golds(docs): | ||||
|     return [GoldParse(doc) for doc in docs] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def batch_size(docs): | ||||
|     return len(docs) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def beam_width(): | ||||
|     return 4 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def vector_size(): | ||||
|     return 6 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def beam(moves, states, golds, beam_width): | ||||
|     return ParserBeam(moves, states, golds, width=beam_width, density=0.0) | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def scores(moves, batch_size, beam_width): | ||||
|     return [ | ||||
|         numpy.asarray( | ||||
|             numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), | ||||
|             dtype='f') | ||||
|         for _ in range(batch_size)] | ||||
| 
 | ||||
| 
 | ||||
| def test_create_beam(beam): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| def test_beam_advance(beam, scores): | ||||
|     beam.advance(scores) | ||||
| 
 | ||||
| 
 | ||||
| def test_beam_advance_too_few_scores(beam, scores): | ||||
|     with pytest.raises(IndexError): | ||||
|         beam.advance(scores[:-1]) | ||||
12 spacy/tests/regression/test_issue1257.py Normal file
							|  | @ -0,0 +1,12 @@ | |||
| '''Test tokens compare correctly''' | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ..util import get_doc | ||||
| from ...vocab import Vocab | ||||
| 
 | ||||
| 
 | ||||
| def test_issue1257(): | ||||
|     doc1 = get_doc(Vocab(), ['a', 'b', 'c']) | ||||
|     doc2 = get_doc(Vocab(), ['a', 'c', 'e']) | ||||
|     assert doc1[0] != doc2[0] | ||||
|     assert not doc1[0] == doc2[0] | ||||
|  | @ -11,8 +11,8 @@ import pytest | |||
| def taggers(en_vocab): | ||||
|     tagger1 = Tagger(en_vocab) | ||||
|     tagger2 = Tagger(en_vocab) | ||||
|     tagger1.model = tagger1.Model(None, None) | ||||
|     tagger2.model = tagger2.Model(None, None) | ||||
|     tagger1.model = tagger1.Model(8, 8) | ||||
|     tagger2.model = tagger1.model | ||||
|     return (tagger1, tagger2) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): | |||
|     tagger1, tagger2 = taggers | ||||
|     tagger1_b = tagger1.to_bytes() | ||||
|     tagger2_b = tagger2.to_bytes() | ||||
|     assert tagger1_b == tagger2_b | ||||
|     tagger1 = tagger1.from_bytes(tagger1_b) | ||||
|     assert tagger1.to_bytes() == tagger1_b | ||||
|     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b) | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ..util import get_doc | ||||
| from ...attrs import ORTH, LENGTH | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
|  | @ -89,3 +90,19 @@ def test_spans_are_hashable(en_tokenizer): | |||
|     span3 = tokens[0:2] | ||||
|     assert hash(span3) == hash(span1) | ||||
|   | ||||
| 
 | ||||
| def test_spans_by_character(doc): | ||||
|     span1 = doc[1:-2] | ||||
|     span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE') | ||||
|     assert span1.start_char == span2.start_char | ||||
|     assert span1.end_char == span2.end_char | ||||
|     assert span2.label_ == 'GPE' | ||||
| 
 | ||||
| 
 | ||||
| def test_span_to_array(doc): | ||||
|     span = doc[1:-2] | ||||
|     arr = span.to_array([ORTH, LENGTH]) | ||||
|     assert arr.shape == (len(span), 2) | ||||
|     assert arr[0, 0] == span[0].orth | ||||
|     assert arr[0, 1] == len(span[0]) | ||||
| 
 | ||||
|  |  | |||
|  | @ -79,9 +79,9 @@ def add_vecs_to_vocab(vocab, vectors): | |||
|     """Add list of vector tuples to given vocab. All vectors need to have the | ||||
|     same length. Format: [("text", [1, 2, 3])]""" | ||||
|     length = len(vectors[0][1]) | ||||
|     vocab.resize_vectors(length) | ||||
|     vocab.clear_vectors(length) | ||||
|     for word, vec in vectors: | ||||
|         vocab[word].vector = vec | ||||
|         vocab.set_vector(word, vec) | ||||
|     return vocab | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -14,10 +14,9 @@ def vectors(): | |||
| 
 | ||||
| @pytest.fixture() | ||||
| def vocab(en_vocab, vectors): | ||||
|     #return add_vecs_to_vocab(en_vocab, vectors) | ||||
|     return None | ||||
|     add_vecs_to_vocab(en_vocab, vectors) | ||||
|     return en_vocab | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_vectors_similarity_LL(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     lex1 = vocab[word1] | ||||
|  | @ -31,7 +30,6 @@ def test_vectors_similarity_LL(vocab, vectors): | |||
|     assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1)) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_vectors_similarity_TT(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     doc = get_doc(vocab, words=[word1, word2]) | ||||
|  | @ -44,21 +42,18 @@ def test_vectors_similarity_TT(vocab, vectors): | |||
|     assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1])) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_vectors_similarity_TD(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     doc = get_doc(vocab, words=[word1, word2]) | ||||
|     assert doc.similarity(doc[0]) == doc[0].similarity(doc) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_vectors_similarity_DS(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     doc = get_doc(vocab, words=[word1, word2]) | ||||
|     assert doc.similarity(doc[:2]) == doc[:2].similarity(doc) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.xfail | ||||
| def test_vectors_similarity_TS(vocab, vectors): | ||||
|     [(word1, vec1), (word2, vec2)] = vectors | ||||
|     doc = get_doc(vocab, words=[word1, word2]) | ||||
|  |  | |||
|  | @ -2,6 +2,8 @@ | |||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from ...vectors import Vectors | ||||
| from ...tokenizer import Tokenizer | ||||
| from ..util import add_vecs_to_vocab, get_doc | ||||
| 
 | ||||
| import numpy | ||||
| import pytest | ||||
|  | @ -11,22 +13,42 @@ import pytest | |||
| def strings(): | ||||
|     return ["apple", "orange"] | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def vectors(): | ||||
|     return [ | ||||
|         ("apple", [1, 2, 3]), | ||||
|         ("orange", [-1, -2, -3]), | ||||
|         ('and', [-1, -1, -1]), | ||||
|         ('juice', [5, 5, 10]), | ||||
|         ('pie', [7, 6.3, 8.9])] | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
| def data(): | ||||
|     return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f') | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture() | ||||
| def vocab(en_vocab, vectors): | ||||
|     add_vecs_to_vocab(en_vocab, vectors) | ||||
|     return en_vocab | ||||
| 
 | ||||
| 
 | ||||
| def test_init_vectors_with_data(strings, data): | ||||
|     v = Vectors(strings, data) | ||||
|     assert v.shape == data.shape | ||||
| 
 | ||||
| def test_init_vectors_with_width(strings): | ||||
|     v = Vectors(strings, 3) | ||||
|     for string in strings: | ||||
|         v.add(string) | ||||
|     assert v.shape == (len(strings), 3) | ||||
| 
 | ||||
| 
 | ||||
| def test_get_vector(strings, data): | ||||
|     v = Vectors(strings, data) | ||||
|     for string in strings: | ||||
|         v.add(string) | ||||
|     assert list(v[strings[0]]) == list(data[0]) | ||||
|     assert list(v[strings[0]]) != list(data[1]) | ||||
|     assert list(v[strings[1]]) != list(data[0]) | ||||
|  | @ -35,6 +57,8 @@ def test_get_vector(strings, data): | |||
| def test_set_vector(strings, data): | ||||
|     orig = data.copy() | ||||
|     v = Vectors(strings, data) | ||||
|     for string in strings: | ||||
|         v.add(string) | ||||
|     assert list(v[strings[0]]) == list(orig[0]) | ||||
|     assert list(v[strings[0]]) != list(orig[1]) | ||||
|     v[strings[0]] = data[1] | ||||
|  | @ -42,125 +66,111 @@ def test_set_vector(strings, data): | |||
|     assert list(v[strings[0]]) != list(orig[0]) | ||||
| 
 | ||||
| 
 | ||||
| # | ||||
| #@pytest.fixture() | ||||
| #def tokenizer_v(vocab): | ||||
| #    return Tokenizer(vocab, {}, None, None, None) | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', ["apple and orange"]) | ||||
| #def test_vectors_token_vector(tokenizer_v, vectors, text): | ||||
| #    doc = tokenizer_v(text) | ||||
| #    assert vectors[0] == (doc[0].text, list(doc[0].vector)) | ||||
| #    assert vectors[1] == (doc[2].text, list(doc[2].vector)) | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', ["apple", "orange"]) | ||||
| #def test_vectors_lexeme_vector(vocab, text): | ||||
| #    lex = vocab[text] | ||||
| #    assert list(lex.vector) | ||||
| #    assert lex.vector_norm | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) | ||||
| #def test_vectors_doc_vector(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    assert list(doc.vector) | ||||
| #    assert doc.vector_norm | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "and", "orange"]]) | ||||
| #def test_vectors_span_vector(vocab, text): | ||||
| #    span = get_doc(vocab, text)[0:2] | ||||
| #    assert list(span.vector) | ||||
| #    assert span.vector_norm | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', ["apple orange"]) | ||||
| #def test_vectors_token_token_similarity(tokenizer_v, text): | ||||
| #    doc = tokenizer_v(text) | ||||
| #    assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) | ||||
| #    assert 0.0 < doc[0].similarity(doc[1]) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) | ||||
| #def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2): | ||||
| #    token = tokenizer_v(text1) | ||||
| #    lex = vocab[text2] | ||||
| #    assert token.similarity(lex) == lex.similarity(token) | ||||
| #    assert 0.0 < token.similarity(lex) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| #def test_vectors_token_span_similarity(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0]) | ||||
| #    assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| #def test_vectors_token_doc_similarity(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    assert doc[0].similarity(doc) == doc.similarity(doc[0]) | ||||
| #    assert 0.0 < doc[0].similarity(doc) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| #def test_vectors_lexeme_span_similarity(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    lex = vocab[text[0]] | ||||
| #    assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex) | ||||
| #    assert 0.0 < doc.similarity(doc[1:3]) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text1,text2', [("apple", "orange")]) | ||||
| #def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2): | ||||
| #    lex1 = vocab[text1] | ||||
| #    lex2 = vocab[text2] | ||||
| #    assert lex1.similarity(lex2) == lex2.similarity(lex1) | ||||
| #    assert 0.0 < lex1.similarity(lex2) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| #def test_vectors_lexeme_doc_similarity(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    lex = vocab[text[0]] | ||||
| #    assert lex.similarity(doc) == doc.similarity(lex) | ||||
| #    assert 0.0 < lex.similarity(doc) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| #def test_vectors_span_span_similarity(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) | ||||
| #    assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| #def test_vectors_span_doc_similarity(vocab, text): | ||||
| #    doc = get_doc(vocab, text) | ||||
| #    assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) | ||||
| #    assert 0.0 < doc[0:2].similarity(doc) < 1.0 | ||||
| # | ||||
| # | ||||
| #@pytest.mark.xfail | ||||
| #@pytest.mark.parametrize('text1,text2', [ | ||||
| #    (["apple", "and", "apple", "pie"], ["orange", "juice"])]) | ||||
| #def test_vectors_doc_doc_similarity(vocab, text1, text2): | ||||
| #    doc1 = get_doc(vocab, text1) | ||||
| #    doc2 = get_doc(vocab, text2) | ||||
| #    assert doc1.similarity(doc2) == doc2.similarity(doc1) | ||||
| #    assert 0.0 < doc1.similarity(doc2) < 1.0 | ||||
| 
 | ||||
| @pytest.fixture() | ||||
| def tokenizer_v(vocab): | ||||
|     return Tokenizer(vocab, {}, None, None, None) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["apple and orange"]) | ||||
| def test_vectors_token_vector(tokenizer_v, vectors, text): | ||||
|     doc = tokenizer_v(text) | ||||
|     assert vectors[0] == (doc[0].text, list(doc[0].vector)) | ||||
|     assert vectors[1] == (doc[2].text, list(doc[2].vector)) | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["apple", "orange"]) | ||||
| def test_vectors_lexeme_vector(vocab, text): | ||||
|     lex = vocab[text] | ||||
|     assert list(lex.vector) | ||||
|     assert lex.vector_norm | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "and", "orange"]]) | ||||
| def test_vectors_doc_vector(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert list(doc.vector) | ||||
|     assert doc.vector_norm | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "and", "orange"]]) | ||||
| def test_vectors_span_vector(vocab, text): | ||||
|     span = get_doc(vocab, text)[0:2] | ||||
|     assert list(span.vector) | ||||
|     assert span.vector_norm | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', ["apple orange"]) | ||||
| def test_vectors_token_token_similarity(tokenizer_v, text): | ||||
|     doc = tokenizer_v(text) | ||||
|     assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0]) | ||||
|     assert -1. < doc[0].similarity(doc[1]) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text1,text2', [("apple", "orange")]) | ||||
| def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2): | ||||
|     token = tokenizer_v(text1) | ||||
|     lex = vocab[text2] | ||||
|     assert token.similarity(lex) == lex.similarity(token) | ||||
|     assert -1. < token.similarity(lex) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_token_span_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0]) | ||||
|     assert -1. < doc[0].similarity(doc[1:3]) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_token_doc_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert doc[0].similarity(doc) == doc.similarity(doc[0]) | ||||
|     assert -1. < doc[0].similarity(doc) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_lexeme_span_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     lex = vocab[text[0]] | ||||
|     assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex) | ||||
|     assert -1. < lex.similarity(doc[1:3]) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text1,text2', [("apple", "orange")]) | ||||
| def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2): | ||||
|     lex1 = vocab[text1] | ||||
|     lex2 = vocab[text2] | ||||
|     assert lex1.similarity(lex2) == lex2.similarity(lex1) | ||||
|     assert -1. < lex1.similarity(lex2) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_lexeme_doc_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     lex = vocab[text[0]] | ||||
|     assert lex.similarity(doc) == doc.similarity(lex) | ||||
|     assert -1. < lex.similarity(doc) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_span_span_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2]) | ||||
|     assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text', [["apple", "orange", "juice"]]) | ||||
| def test_vectors_span_doc_similarity(vocab, text): | ||||
|     doc = get_doc(vocab, text) | ||||
|     assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2]) | ||||
|     assert -1. < doc[0:2].similarity(doc) < 1.0 | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize('text1,text2', [ | ||||
|     (["apple", "and", "apple", "pie"], ["orange", "juice"])]) | ||||
| def test_vectors_doc_doc_similarity(vocab, text1, text2): | ||||
|     doc1 = get_doc(vocab, text1) | ||||
|     doc2 = get_doc(vocab, text2) | ||||
|     assert doc1.similarity(doc2) == doc2.similarity(doc1) | ||||
|     assert -1. < doc1.similarity(doc2) < 1.0 | ||||
|  |  | |||
|  | @ -238,6 +238,29 @@ cdef class Doc: | |||
|     def doc(self): | ||||
|         return self | ||||
| 
 | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, vector=None): | ||||
|         """Create a `Span` object from the slice `doc.text[start : end]`. | ||||
| 
 | ||||
|         start_idx (int): The index of the first character of the span. | ||||
|         end_idx (int): The index of the first character after the span. | ||||
|         label (uint64 or string): A label to attach to the span, e.g. for named entities. | ||||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. | ||||
|         RETURNS (Span): The newly constructed object. | ||||
|         """ | ||||
|         if not isinstance(label, int): | ||||
|             label = self.vocab.strings.add(label) | ||||
|         cdef int start = token_by_start(self.c, self.length, start_idx) | ||||
|         if start == -1: | ||||
|             return None | ||||
|         cdef int end = token_by_end(self.c, self.length, end_idx) | ||||
|         if end == -1: | ||||
|             return None | ||||
|         # Currently we have the token index, we want the range-end index | ||||
|         end += 1 | ||||
|         cdef Span span = Span(self, start, end, label=label, vector=vector) | ||||
|         return span | ||||
| 
 | ||||
|     def similarity(self, other): | ||||
|         """Make a semantic similarity estimate. The default estimate is cosine | ||||
|         similarity using an average of word vectors. | ||||
|  |  | |||
|  | @ -15,5 +15,5 @@ cdef class Span: | |||
|     cdef public _vector | ||||
|     cdef public _vector_norm | ||||
| 
 | ||||
| 
 | ||||
|     cpdef int _recalculate_indices(self) except -1 | ||||
|     cpdef np.ndarray to_array(self, object features) | ||||
|  |  | |||
|  | @ -7,7 +7,7 @@ import numpy | |||
| import numpy.linalg | ||||
| from libc.math cimport sqrt | ||||
| 
 | ||||
| from .doc cimport token_by_start, token_by_end | ||||
| from .doc cimport token_by_start, token_by_end, get_token_attr | ||||
| from ..structs cimport TokenC, LexemeC | ||||
| from ..typedefs cimport flags_t, attr_t, hash_t | ||||
| from ..attrs cimport attr_id_t | ||||
|  | @ -135,6 +135,29 @@ cdef class Span: | |||
|             return 0.0 | ||||
|         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||
| 
 | ||||
|     cpdef np.ndarray to_array(self, object py_attr_ids): | ||||
|         """Given a list of M attribute IDs, export the tokens to a numpy | ||||
|         `ndarray` of shape `(N, M)`, where `N` is the length of the span. | ||||
|         The values will be 64-bit unsigned integers. | ||||
| 
 | ||||
|         attr_ids (list[int]): A list of attribute ID ints. | ||||
|         RETURNS (numpy.ndarray[uint64, ndim=2]): A feature matrix, with one row | ||||
|             per word, and one column per attribute indicated in the input | ||||
|             `attr_ids`. | ||||
|         """ | ||||
|         cdef int i, j | ||||
|         cdef attr_id_t feature | ||||
|         cdef np.ndarray[attr_t, ndim=2] output | ||||
|         # Make an array from the attributes --- otherwise our inner loop is Python | ||||
|         # dict iteration. | ||||
|         cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) | ||||
|         cdef int length = self.end - self.start | ||||
|         output = numpy.ndarray(shape=(length, len(attr_ids)), dtype=numpy.uint64) | ||||
|         for i in range(self.start, self.end): | ||||
|             for j, feature in enumerate(attr_ids): | ||||
|                 output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature) | ||||
|         return output | ||||
| 
 | ||||
|     cpdef int _recalculate_indices(self) except -1: | ||||
|         if self.end > self.doc.length \ | ||||
|         or self.doc.c[self.start].idx != self.start_char \ | ||||
|  |  | |||
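A usage sketch for the new `Span.to_array` method (it mirrors the website example added below; assumes a hypothetical `nlp` model with a tagger):

    from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]                          # 'New York'
    np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    assert np_array.shape == (2, 4)          # one row per token, one column per attribute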
|  | @ -62,18 +62,26 @@ cdef class Token: | |||
| 
 | ||||
|     def __richcmp__(self, Token other, int op): | ||||
|         # http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html | ||||
|         cdef Doc my_doc = self.doc | ||||
|         cdef Doc other_doc = other.doc if other is not None else None | ||||
|         my = self.idx | ||||
|         their = other.idx if other is not None else None | ||||
|         if op == 0: | ||||
|             return my < their | ||||
|         elif op == 2: | ||||
|             return my == their | ||||
|             if my_doc is other_doc: | ||||
|                 return my == their | ||||
|             else: | ||||
|                 return False | ||||
|         elif op == 4: | ||||
|             return my > their | ||||
|         elif op == 1: | ||||
|             return my <= their | ||||
|         elif op == 3: | ||||
|             return my != their | ||||
|             if my_doc is other_doc: | ||||
|                 return my != their | ||||
|             else: | ||||
|                 return True | ||||
|         elif op == 5: | ||||
|             return my >= their | ||||
|         else: | ||||
|  |  | |||
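The `__richcmp__` change above scopes token equality to the parent document: tokens at the same offset in different docs no longer compare equal. A sketch of the intended behaviour (hypothetical `nlp` pipeline):

    doc1 = nlp(u'hello world')
    doc2 = nlp(u'hello world')
    assert doc1[0] == doc1[0]    # same doc, same offset
    assert doc1[0] != doc2[0]    # same offset, different docs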
|  | @ -22,7 +22,7 @@ import ujson | |||
| 
 | ||||
| from .symbols import ORTH | ||||
| from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ | ||||
| from .compat import copy_array, normalize_string_keys, getattr_ | ||||
| from .compat import copy_array, normalize_string_keys, getattr_, import_file | ||||
| 
 | ||||
| 
 | ||||
| LANGUAGES = {} | ||||
|  | @ -112,15 +112,13 @@ def load_model(name, **overrides): | |||
| 
 | ||||
| def load_model_from_link(name, **overrides): | ||||
|     """Load a model from a shortcut link, or directory in spaCy data path.""" | ||||
|     init_file = get_data_path() / name / '__init__.py' | ||||
|     spec = importlib.util.spec_from_file_location(name, str(init_file)) | ||||
|     path = get_data_path() / name / '__init__.py' | ||||
|     try: | ||||
|         cls = importlib.util.module_from_spec(spec) | ||||
|         cls = import_file(name, path) | ||||
|     except AttributeError: | ||||
|         raise IOError( | ||||
|             "Cant' load '%s'. If you're using a shortcut link, make sure it " | ||||
|             "points to a valid model package (not just a data directory)." % name) | ||||
|     spec.loader.exec_module(cls) | ||||
|     return cls.load(**overrides) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
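The importlib boilerplate removed above presumably moves into the new `compat.import_file` helper. A sketch of such a helper, reconstructed from the deleted lines (not necessarily the exact implementation):

    import importlib.util

    def import_file(name, loc):
        # Load a Python module from a file path, e.g. a model package's __init__.py.
        spec = importlib.util.spec_from_file_location(name, str(loc))
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return module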
|  | @ -1,18 +1,25 @@ | |||
| from __future__ import unicode_literals | ||||
| from libc.stdint cimport int32_t, uint64_t | ||||
| import numpy | ||||
| from collections import OrderedDict | ||||
| import msgpack | ||||
| import msgpack_numpy | ||||
| msgpack_numpy.patch() | ||||
| cimport numpy as np | ||||
| 
 | ||||
| from .typedefs cimport attr_t | ||||
| from .strings cimport StringStore | ||||
| from . import util | ||||
| from .compat import basestring_ | ||||
| 
 | ||||
| 
 | ||||
| cdef class Vectors: | ||||
|     '''Store, save and load word vectors.''' | ||||
|     cdef public object data | ||||
|     cdef readonly StringStore strings | ||||
|     cdef public object key2i | ||||
|     cdef public object key2row | ||||
|     cdef public object keys | ||||
|     cdef public int i | ||||
| 
 | ||||
|     def __init__(self, strings, data_or_width): | ||||
|         self.strings = StringStore() | ||||
|  | @ -21,10 +28,10 @@ cdef class Vectors: | |||
|                                            dtype='f') | ||||
|         else: | ||||
|             data = data_or_width | ||||
|         self.i = 0 | ||||
|         self.data = data | ||||
|         self.key2i = {} | ||||
|         for i, string in enumerate(strings): | ||||
|             self.key2i[self.strings.add(string)] = i | ||||
|         self.key2row = {} | ||||
|         self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')  | ||||
| 
 | ||||
|     def __reduce__(self): | ||||
|         return (Vectors, (self.strings, self.data)) | ||||
|  | @ -32,7 +39,7 @@ cdef class Vectors: | |||
|     def __getitem__(self, key): | ||||
|         if isinstance(key, basestring): | ||||
|             key = self.strings[key] | ||||
|         i = self.key2i[key] | ||||
|         i = self.key2row[key] | ||||
|         if i is None: | ||||
|             raise KeyError(key) | ||||
|         else: | ||||
|  | @ -41,14 +48,36 @@ cdef class Vectors: | |||
|     def __setitem__(self, key, vector): | ||||
|         if isinstance(key, basestring): | ||||
|             key = self.strings.add(key) | ||||
|         i = self.key2i[key] | ||||
|         i = self.key2row[key] | ||||
|         self.data[i] = vector | ||||
| 
 | ||||
|     def __iter__(self): | ||||
|         yield from self.data | ||||
| 
 | ||||
|     def __len__(self): | ||||
|         return len(self.strings) | ||||
|         return self.i | ||||
| 
 | ||||
|     def __contains__(self, key): | ||||
|         if isinstance(key, basestring_): | ||||
|             key = self.strings[key] | ||||
|         return key in self.key2row | ||||
| 
 | ||||
|     def add(self, key, vector=None): | ||||
|         if isinstance(key, basestring_): | ||||
|             key = self.strings.add(key) | ||||
|         if key not in self.key2row: | ||||
|             i = self.i | ||||
|             if i >= self.keys.shape[0]: | ||||
|                 self.keys.resize((self.keys.shape[0]*2,)) | ||||
|                 self.data.resize((self.data.shape[0]*2, self.data.shape[1])) | ||||
|             self.key2row[key] = self.i | ||||
|             self.keys[self.i] = key | ||||
|             self.i += 1 | ||||
|         else: | ||||
|             i = self.key2row[key] | ||||
|         if vector is not None: | ||||
|             self.data[i] = vector | ||||
|         return i | ||||
| 
 | ||||
|     def items(self): | ||||
|         for i, string in enumerate(self.strings): | ||||
|  | @ -61,34 +90,60 @@ cdef class Vectors: | |||
|     def most_similar(self, key): | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     def to_disk(self, path): | ||||
|         raise NotImplementedError | ||||
|     def to_disk(self, path, **exclude): | ||||
|         serializers = OrderedDict(( | ||||
|             ('vectors', lambda p: numpy.save(p.open('wb'), self.data, allow_pickle=False)), | ||||
|             ('keys', lambda p: numpy.save(p.open('wb'), self.keys, allow_pickle=False)), | ||||
|         )) | ||||
|         return util.to_disk(path, serializers, exclude) | ||||
| 
 | ||||
|     def from_disk(self, path): | ||||
|         raise NotImplementedError | ||||
|     def from_disk(self, path, **exclude): | ||||
|         def load_keys(path): | ||||
|             if path.exists(): | ||||
|                 self.keys = numpy.load(path) | ||||
|                 for i, key in enumerate(self.keys): | ||||
|                     self.key2row[key] = i | ||||
| 
 | ||||
|         def load_vectors(path): | ||||
|             if path.exists(): | ||||
|                 self.data = numpy.load(path) | ||||
| 
 | ||||
|         serializers = OrderedDict(( | ||||
|             ('keys', load_keys), | ||||
|             ('vectors', load_vectors), | ||||
|         )) | ||||
|         util.from_disk(path, serializers, exclude) | ||||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|         def serialize_weights(): | ||||
|             if hasattr(self.weights, 'to_bytes'): | ||||
|                 return self.weights.to_bytes() | ||||
|             if hasattr(self.data, 'to_bytes'): | ||||
|                 return self.data.to_bytes() | ||||
|             else: | ||||
|                 return msgpack.dumps(self.weights) | ||||
| 
 | ||||
|                 return msgpack.dumps(self.data) | ||||
|         serializers = OrderedDict(( | ||||
|             ('strings', lambda: self.strings.to_bytes()), | ||||
|             ('weights', serialize_weights) | ||||
|             ('keys', lambda: msgpack.dumps(self.keys)), | ||||
|             ('vectors', serialize_weights) | ||||
|         )) | ||||
|         return util.to_bytes(serializers, exclude) | ||||
| 
 | ||||
|     def from_bytes(self, data, **exclude): | ||||
|         def deserialize_weights(b): | ||||
|             if hasattr(self.weights, 'from_bytes'): | ||||
|                 self.weights.from_bytes() | ||||
|             if hasattr(self.data, 'from_bytes'): | ||||
|                 self.data.from_bytes(b) | ||||
|             else: | ||||
|                 self.weights = msgpack.loads(b) | ||||
|                 self.data = msgpack.loads(b) | ||||
| 
 | ||||
|         def load_keys(keys): | ||||
|             self.keys.resize((len(keys),)) | ||||
|             for i, key in enumerate(keys): | ||||
|                 self.keys[i] = key | ||||
|                 self.key2row[key] = i | ||||
| 
 | ||||
|         deserializers = OrderedDict(( | ||||
|             ('strings', lambda b: self.strings.from_bytes(b)), | ||||
|             ('weights', deserialize_weights) | ||||
|             ('keys', lambda b: load_keys(msgpack.loads(b))), | ||||
|             ('vectors', deserialize_weights) | ||||
|         )) | ||||
|         return util.from_bytes(deserializers, exclude) | ||||
|         util.from_bytes(data, deserializers, exclude) | ||||
|         return self | ||||
|  |  | |||
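A usage sketch for the reworked `Vectors` table: `add` registers a key, grows the storage when needed and returns the row index, and the new `keys`/`vectors` serializers round-trip through bytes. Toy values; the backing array shape is an assumption:

    import numpy
    from spacy.strings import StringStore
    from spacy.vectors import Vectors

    data = numpy.zeros((2, 3), dtype='f')          # (rows, vector width)
    vectors = Vectors(StringStore(), data)
    row = vectors.add(u'apple', vector=numpy.asarray([1., 2., 3.], dtype='f'))
    assert u'apple' in vectors
    assert vectors[u'apple'][0] == 1.
    # Serialization round-trip via the new to_bytes/from_bytes:
    restored = Vectors(StringStore(), data.copy()).from_bytes(vectors.to_bytes())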
|  | @ -19,9 +19,10 @@ from .tokens.token cimport Token | |||
| from .attrs cimport PROB, LANG | ||||
| from .structs cimport SerializedLexemeC | ||||
| 
 | ||||
| from .compat import copy_reg, pickle | ||||
| from .compat import copy_reg, pickle, basestring_ | ||||
| from .lemmatizer import Lemmatizer | ||||
| from .attrs import intify_attrs | ||||
| from .vectors import Vectors | ||||
| from . import util | ||||
| from . import attrs | ||||
| from . import symbols | ||||
|  | @ -63,6 +64,7 @@ cdef class Vocab: | |||
|                 self.strings.add(name) | ||||
|         self.lex_attr_getters = lex_attr_getters | ||||
|         self.morphology = Morphology(self.strings, tag_map, lemmatizer) | ||||
|         self.vectors = Vectors(self.strings, 300) | ||||
| 
 | ||||
|     property lang: | ||||
|         def __get__(self): | ||||
|  | @ -242,13 +244,15 @@ cdef class Vocab: | |||
| 
 | ||||
|     @property | ||||
|     def vectors_length(self): | ||||
|         raise NotImplementedError | ||||
|         return len(self.vectors) | ||||
| 
 | ||||
|     def clear_vectors(self): | ||||
|     def clear_vectors(self, new_dim=None): | ||||
|         """Drop the current vector table. Because all vectors must be the same | ||||
|         width, you have to call this to change the size of the vectors. | ||||
|         """ | ||||
|         raise NotImplementedError | ||||
|         if new_dim is None: | ||||
|             new_dim = self.vectors.data.shape[1] | ||||
|         self.vectors = Vectors(self.strings, new_dim) | ||||
| 
 | ||||
|     def get_vector(self, orth): | ||||
|         """Retrieve a vector for a word in the vocabulary. | ||||
|  | @ -262,7 +266,9 @@ cdef class Vocab: | |||
| 
 | ||||
|         RAISES (ValueError): If no vectors data is loaded. | ||||
|         """ | ||||
|         raise NotImplementedError | ||||
|         if isinstance(orth, basestring_): | ||||
|             orth = self.strings.add(orth) | ||||
|         return self.vectors[orth] | ||||
| 
 | ||||
|     def set_vector(self, orth, vector): | ||||
|         """Set a vector for a word in the vocabulary. | ||||
|  | @ -272,15 +278,19 @@ cdef class Vocab: | |||
|         RETURNS: | ||||
|             None | ||||
|         """ | ||||
|         raise NotImplementedError | ||||
|         if not isinstance(orth, basestring_): | ||||
|             orth = self.strings[orth] | ||||
|         self.vectors.add(orth, vector=vector) | ||||
| 
 | ||||
|     def has_vector(self, orth): | ||||
|         """Check whether a word has a vector. Returns False if no | ||||
|         vectors have been loaded. Words can be looked up by string | ||||
|         or int ID.""" | ||||
|         return False | ||||
|         if isinstance(orth, basestring_): | ||||
|             orth = self.strings.add(orth) | ||||
|         return orth in self.vectors | ||||
| 
 | ||||
|     def to_disk(self, path): | ||||
|     def to_disk(self, path, **exclude): | ||||
|         """Save the current state to a directory. | ||||
| 
 | ||||
|         path (unicode or Path): A path to a directory, which will be created if | ||||
|  | @ -292,8 +302,10 @@ cdef class Vocab: | |||
|         self.strings.to_disk(path / 'strings.json') | ||||
|         with (path / 'lexemes.bin').open('wb') as file_: | ||||
|             file_.write(self.lexemes_to_bytes()) | ||||
|         if self.vectors is not None: | ||||
|             self.vectors.to_disk(path) | ||||
| 
 | ||||
|     def from_disk(self, path): | ||||
|     def from_disk(self, path, **exclude): | ||||
|         """Loads state from a directory. Modifies the object in place and | ||||
|         returns it. | ||||
| 
 | ||||
|  | @ -305,6 +317,8 @@ cdef class Vocab: | |||
|         self.strings.from_disk(path / 'strings.json') | ||||
|         with (path / 'lexemes.bin').open('rb') as file_: | ||||
|             self.lexemes_from_bytes(file_.read()) | ||||
|         if self.vectors is not None: | ||||
|             self.vectors.from_disk(path, exclude='strings.json') | ||||
|         return self | ||||
| 
 | ||||
|     def to_bytes(self, **exclude): | ||||
|  | @ -313,9 +327,16 @@ cdef class Vocab: | |||
|         **exclude: Named attributes to prevent from being serialized. | ||||
|         RETURNS (bytes): The serialized form of the `Vocab` object. | ||||
|         """ | ||||
|         def serialize_vectors(): | ||||
|             if self.vectors is None: | ||||
|                 return None | ||||
|             else: | ||||
|                 return self.vectors.to_bytes(exclude='strings.json') | ||||
|   | ||||
|         getters = OrderedDict(( | ||||
|             ('strings', lambda: self.strings.to_bytes()), | ||||
|             ('lexemes', lambda: self.lexemes_to_bytes()), | ||||
|             ('vectors', serialize_vectors) | ||||
|         )) | ||||
|         return util.to_bytes(getters, exclude) | ||||
| 
 | ||||
|  | @ -326,9 +347,15 @@ cdef class Vocab: | |||
|         **exclude: Named attributes to prevent from being loaded. | ||||
|         RETURNS (Vocab): The `Vocab` object. | ||||
|         """ | ||||
|         def deserialize_vectors(b): | ||||
|             if self.vectors is None: | ||||
|                 return None | ||||
|             else: | ||||
|                 return self.vectors.from_bytes(b, exclude='strings') | ||||
|         setters = OrderedDict(( | ||||
|             ('strings', lambda b: self.strings.from_bytes(b)), | ||||
|             ('lexemes', lambda b: self.lexemes_from_bytes(b)), | ||||
|             ('vectors', lambda b: deserialize_vectors(b)) | ||||
|         )) | ||||
|         util.from_bytes(bytes_data, setters, exclude) | ||||
|         return self | ||||
|  |  | |||
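With vectors wired into the `Vocab` above, the new getter/setter API can be exercised like this (a sketch with toy values; the width of 300 comes from the default table created in `__init__` above):

    import numpy
    from spacy.vocab import Vocab

    vocab = Vocab()
    vocab.set_vector(u'apple', numpy.ones((300,), dtype='f'))
    assert vocab.has_vector(u'apple')
    assert vocab.get_vector(u'apple').shape == (300,)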
|  | @ -112,6 +112,10 @@ | |||
| .u-nowrap | ||||
|     white-space: nowrap | ||||
| 
 | ||||
| .u-break.u-break | ||||
|     word-wrap: break-word | ||||
|     white-space: initial | ||||
| 
 | ||||
| .u-no-border | ||||
|     border: none | ||||
| 
 | ||||
|  |  | |||
|  | @ -140,6 +140,43 @@ p Get the number of tokens in the document. | |||
|         +cell int | ||||
|         +cell The number of tokens in the document. | ||||
| 
 | ||||
| +h(2, "char_span") Doc.char_span | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p Create a #[code Span] object from the slice #[code doc.text[start : end]]. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     doc = nlp(u'I like New York') | ||||
|     span = doc.char_span(7, 15, label=u'GPE') | ||||
|     assert span.text == 'New York' | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code start] | ||||
|         +cell int | ||||
|         +cell The index of the first character of the span. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code end] | ||||
|         +cell int | ||||
|         +cell The index of the first character after the span. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code label] | ||||
|         +cell uint64 / unicode | ||||
|         +cell A label to attach to the Span, e.g. for named entities. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code vector] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell A meaning representation of the span. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code Span] | ||||
|         +cell The newly constructed object. | ||||
| 
 | ||||
| +h(2, "similarity") Doc.similarity | ||||
|     +tag method | ||||
|     +tag-model("vectors") | ||||
|  | @ -211,12 +248,12 @@ p | |||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code attr_ids] | ||||
|         +cell ints | ||||
|         +cell list | ||||
|         +cell A list of attribute ID ints. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code numpy.ndarray[ndim=2, dtype='int32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] | ||||
|         +cell | ||||
|             |  The exported attributes as a 2D numpy array, with one row per | ||||
|             |  token and one column per attribute. | ||||
|  | @ -245,7 +282,7 @@ p | |||
| 
 | ||||
|     +row | ||||
|         +cell #[code array] | ||||
|         +cell #[code numpy.ndarray[ndim=2, dtype='int32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] | ||||
|         +cell The attribute values to load. | ||||
| 
 | ||||
|     +footrow | ||||
|  | @ -509,7 +546,7 @@ p | |||
| +table(["Name", "Type", "Description"]) | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell A 1D numpy array representing the document's semantics. | ||||
| 
 | ||||
| +h(2, "vector_norm") Doc.vector_norm | ||||
|  |  | |||
|  | @ -111,6 +111,14 @@ p | |||
|         +cell - | ||||
|         +cell A sequence of unicode objects. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code as_tuples] | ||||
|         +cell bool | ||||
|         +cell | ||||
|             |  If set to #[code True], inputs should be a sequence of | ||||
|             |  #[code (text, context)] tuples. Output will then be a sequence of | ||||
|             |  #[code (doc, context)] tuples. Defaults to #[code False]. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code n_threads] | ||||
|         +cell int | ||||
|  |  | |||
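A usage sketch for the new #[code as_tuples] flag documented above — context objects pass through the pipeline untouched (hypothetical metadata):

    texts = [(u'First text', {'id': 1}), (u'Second text', {'id': 2})]
    for doc, context in nlp.pipe(texts, as_tuples=True):
        print(doc.text, context['id'])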
|  | @ -129,7 +129,7 @@ p A real-valued meaning representation. | |||
| +table(["Name", "Type", "Description"]) | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell A 1D numpy array representing the lexeme's semantics. | ||||
| 
 | ||||
| +h(2, "vector_norm") Lexeme.vector_norm | ||||
|  |  | |||
|  | @ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]]. | |||
| 
 | ||||
|     +row | ||||
|         +cell #[code vector] | ||||
|         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell A meaning representation of the span. | ||||
| 
 | ||||
|     +footrow | ||||
|  | @ -145,11 +145,47 @@ p | |||
|         +cell float | ||||
|         +cell A scalar similarity score. Higher is more similar. | ||||
| 
 | ||||
| +h(2, "to_array") Span.to_array | ||||
|     +tag method | ||||
|     +tag-new(2) | ||||
| 
 | ||||
| p | ||||
|     |  Given a list of #[code M] attribute IDs, export the tokens to a numpy | ||||
|     |  #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of | ||||
|     |  the span. The values will be 64-bit unsigned integers. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA | ||||
|     doc = nlp(u'I like New York in Autumn.') | ||||
|     span = doc[2:3] | ||||
|     # All strings mapped to integers, for easy export to numpy | ||||
|     np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code attr_ids] | ||||
|         +cell list | ||||
|         +cell A list of attribute ID ints. | ||||
| 
 | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code.u-break numpy.ndarray[uint64, ndim=2]] | ||||
|         +cell | ||||
|             |  A feature matrix, with one row per word, and one column per | ||||
|             |  attribute indicated in the input #[code attr_ids]. | ||||
| 
 | ||||
| +h(2, "merge") Span.merge | ||||
|     +tag method | ||||
| 
 | ||||
| p Retokenize the document, such that the span is merged into a single token. | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     doc = nlp(u'I like New York in Autumn.') | ||||
|     span = doc[2:4] | ||||
|     span.merge() | ||||
|     assert len(doc) == 6 | ||||
|     assert doc[2].text == 'New York' | ||||
| 
 | ||||
| +table(["Name", "Type", "Description"]) | ||||
|     +row | ||||
|         +cell #[code **attributes] | ||||
|  | @ -270,7 +306,7 @@ p | |||
| +table(["Name", "Type", "Description"]) | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell A 1D numpy array representing the span's semantics. | ||||
| 
 | ||||
| +h(2, "vector_norm") Span.vector_norm | ||||
|  |  | |||
|  | @ -250,7 +250,7 @@ p A real-valued meaning representation. | |||
| +table(["Name", "Type", "Description"]) | ||||
|     +footrow | ||||
|         +cell returns | ||||
|         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] | ||||
|         +cell A 1D numpy array representing the token's semantics. | ||||
| 
 | ||||
| +h(2, "vector_norm") Span.vector_norm | ||||
|  |  | |||