Mirror of https://github.com/explosion/spaCy.git

💫 Fix loading of multiple vector models
Commit 98e9cda677

spacy/_ml.py (21 lines changed)
@@ -242,6 +242,11 @@ class PrecomputableAffine(Model):
 def link_vectors_to_models(vocab):
     vectors = vocab.vectors
+    if vectors.name is None:
+        vectors.name = VECTORS_KEY
+        print(
+            "Warning: Unnamed vectors -- this won't allow multiple vectors "
+            "models to be loaded. (Shape: (%d, %d))" % vectors.data.shape)
     ops = Model.ops
     for word in vocab:
         if word.orth in vectors.key2row:
@@ -251,11 +256,11 @@ def link_vectors_to_models(vocab):
     data = ops.asarray(vectors.data)
     # Set an entry here, so that vectors are accessed by StaticVectors
     # (unideal, I know)
-    thinc.extra.load_nlp.VECTORS[(ops.device, VECTORS_KEY)] = data
+    thinc.extra.load_nlp.VECTORS[(ops.device, vectors.name)] = data


 def Tok2Vec(width, embed_size, **kwargs):
-    pretrained_dims = kwargs.get('pretrained_dims', 0)
+    pretrained_vectors = kwargs.get('pretrained_vectors', None)
     cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
     cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
     with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
@@ -268,16 +273,16 @@ def Tok2Vec(width, embed_size, **kwargs):
                            name='embed_suffix')
         shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
                           name='embed_shape')
-        if pretrained_dims is not None and pretrained_dims >= 1:
-            glove = StaticVectors(VECTORS_KEY, width, column=cols.index(ID))
+        if pretrained_vectors is not None:
+            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

             embed = uniqued(
                 (glove | norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width*5, pieces=3)), column=5)
+                >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
         else:
             embed = uniqued(
                 (norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width*4, pieces=3)), column=5)
+                >> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))

         convolution = Residual(
             ExtractWindow(nW=1)
@@ -433,13 +438,13 @@ def build_tagger_model(nr_class, **cfg):
         token_vector_width = cfg['token_vector_width']
     else:
         token_vector_width = util.env_opt('token_vector_width', 128)
-    pretrained_dims = cfg.get('pretrained_dims', 0)
+    pretrained_vectors = cfg.get('pretrained_vectors')
     with Model.define_operators({'>>': chain, '+': add}):
         if 'tok2vec' in cfg:
             tok2vec = cfg['tok2vec']
         else:
             tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              pretrained_dims=pretrained_dims)
+                              pretrained_vectors=pretrained_vectors)
         softmax = with_flatten(Softmax(nr_class, token_vector_width))
         model = (
             tok2vec
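The heart of the fix is in the two `link_vectors_to_models()` hunks above: thinc's global vectors store is now keyed by each table's own name instead of the shared `VECTORS_KEY`, and `Tok2Vec()` forwards that name to `StaticVectors`. A minimal sketch of why the name matters (illustrative only; it assumes the thinc 6.x `thinc.extra.load_nlp.VECTORS` dict shown in the diff behaves as a plain dict and that the default CPU `ops.device` is `'cpu'`):

# Sketch: with per-table names, two models' vectors occupy separate slots in
# thinc's global store instead of overwriting each other.
import thinc.extra.load_nlp
from spacy.vectors import Vectors

en_vectors = Vectors(shape=(10, 300), name='en_core_web_md.vectors')
es_vectors = Vectors(shape=(10, 50), name='es_core_news_md.vectors')

device = 'cpu'  # stands in for Model.ops.device in link_vectors_to_models()
for vectors in (en_vectors, es_vectors):
    thinc.extra.load_nlp.VECTORS[(device, vectors.name)] = vectors.data

# Previously both tables were stored under (device, VECTORS_KEY), so loading a
# second model silently replaced the first model's vectors.
assert thinc.extra.load_nlp.VECTORS[(device, 'en_core_web_md.vectors')].shape == (10, 300)
assert thinc.extra.load_nlp.VECTORS[(device, 'es_core_news_md.vectors')].shape == (10, 50)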
@@ -93,6 +93,7 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     meta['pipeline'] = pipeline
     nlp.meta.update(meta)
     if vectors:
+        print("Load vectors model", vectors)
         util.load_model(vectors, vocab=nlp.vocab)
         for lex in nlp.vocab:
             values = {}
@@ -133,6 +133,8 @@ class Language(object):
         if vocab is True:
             factory = self.Defaults.create_vocab
             vocab = factory(self, **meta.get('vocab', {}))
+            if vocab.vectors.name is None:
+                vocab.vectors.name = meta.get('vectors', {}).get('name')
         self.vocab = vocab
         if make_doc is True:
             factory = self.Defaults.create_tokenizer
@@ -158,7 +160,8 @@ class Language(object):
         self._meta.setdefault('license', '')
         self._meta['vectors'] = {'width': self.vocab.vectors_length,
                                  'vectors': len(self.vocab.vectors),
-                                 'keys': self.vocab.vectors.n_keys}
+                                 'keys': self.vocab.vectors.n_keys,
+                                 'name': self.vocab.vectors.name}
         self._meta['pipeline'] = self.pipe_names
         return self._meta

@@ -457,6 +460,8 @@ class Language(object):
         else:
             device = None
         link_vectors_to_models(self.vocab)
+        if self.vocab.vectors.data.shape[1]:
+            cfg['pretrained_vectors'] = self.vocab.vectors.name
         if sgd is None:
             sgd = create_default_optimizer(Model.ops)
         self._optimizer = sgd
@@ -629,6 +634,7 @@ class Language(object):
             ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
             ('meta.json', lambda p: self.meta.update(util.read_json(p)))
         ))
+        _fix_pretrained_vectors_name(self)
         for name, proc in self.pipeline:
             if name in disable:
                 continue
@@ -674,6 +680,7 @@ class Language(object):
             ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
             ('meta', lambda b: self.meta.update(ujson.loads(b)))
         ))
+        _fix_pretrained_vectors_name(self)
         for i, (name, proc) in enumerate(self.pipeline):
             if name in disable:
                 continue
@@ -683,6 +690,25 @@ class Language(object):
         msg = util.from_bytes(bytes_data, deserializers, {})
         return self

+
+def _fix_pretrained_vectors_name(nlp):
+    # TODO: Replace this once we handle vectors consistently as static
+    # data
+    if 'vectors' in nlp.meta and nlp.meta['vectors'].get('name'):
+        nlp.vocab.vectors.name = nlp.meta['vectors']['name']
+    elif not nlp.vocab.vectors.size:
+        nlp.vocab.vectors.name = None
+    elif 'name' in nlp.meta and 'lang' in nlp.meta:
+        vectors_name = '%s_%s.vectors' % (nlp.meta['lang'], nlp.meta['name'])
+        nlp.vocab.vectors.name = vectors_name
+    else:
+        raise ValueError("Unnamed vectors")
+    for name, proc in nlp.pipeline:
+        if not hasattr(proc, 'cfg'):
+            continue
+        if proc.cfg.get('pretrained_dims'):
+            assert nlp.vocab.vectors.name
+            proc.cfg['pretrained_vectors'] = nlp.vocab.vectors.name
+

 class DisabledPipes(list):
     """Manager for temporary pipeline disabling."""
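The new `_fix_pretrained_vectors_name()` helper above backfills a vectors name when deserialising models saved before this change: an explicit `meta['vectors']['name']` wins, empty vector tables stay unnamed, and otherwise the name is derived from the package's `lang` and `name`. A worked example of that fallback (the meta values are hypothetical):

# Hypothetical meta.json values for an older model package that has no
# meta['vectors']['name'] entry.
meta = {'lang': 'en', 'name': 'core_web_md'}
vectors_name = '%s_%s.vectors' % (meta['lang'], meta['name'])
assert vectors_name == 'en_core_web_md.vectors'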
@@ -202,8 +202,10 @@ class Pipe(object):
     def from_bytes(self, bytes_data, **exclude):
         """Load the pipe from a bytestring."""
         def load_model(b):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
-                self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
                 self.model = self.Model(**self.cfg)
             self.model.from_bytes(b)

@@ -227,8 +229,10 @@ class Pipe(object):
     def from_disk(self, path, **exclude):
         """Load the pipe from disk."""
         def load_model(p):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
-                self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
                 self.model = self.Model(**self.cfg)
             self.model.from_bytes(p.open('rb').read())

@@ -286,7 +290,6 @@ class Tensorizer(Pipe):
         self.model = model
         self.input_models = []
         self.cfg = dict(cfg)
-        self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         self.cfg.setdefault('cnn_maxout_pieces', 3)

     def __call__(self, doc):
@@ -403,8 +406,6 @@ class Tagger(Pipe):
         self.model = model
         self.cfg = OrderedDict(sorted(cfg.items()))
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims',
-                            self.vocab.vectors.data.shape[1])

     @property
     def labels(self):
@@ -515,8 +516,8 @@ class Tagger(Pipe):
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                           vocab.morphology.lemmatizer,
                                           exc=vocab.morphology.exc)
+        self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
             self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
         link_vectors_to_models(self.vocab)
         if sgd is None:
@@ -525,6 +526,13 @@ class Tagger(Pipe):

     @classmethod
     def Model(cls, n_tags, **cfg):
+        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
+            raise ValueError(
+                "Bad configuration of Tagger --- this is probably a bug "
+                "within spaCy. We changed the name of an internal attribute "
+                "for loading pre-trained vectors, and the class has been "
+                "passed the old name (pretrained_dims) but not the new name "
+                "(pretrained_vectors)")
         return build_tagger_model(n_tags, **cfg)

     def add_label(self, label, values=None):
@@ -572,6 +580,10 @@ class Tagger(Pipe):

     def from_bytes(self, bytes_data, **exclude):
         def load_model(b):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+
             if self.model is True:
                 token_vector_width = util.env_opt(
                     'token_vector_width',
@@ -597,7 +609,6 @@ class Tagger(Pipe):
         return self

     def to_disk(self, path, **exclude):
-        self.cfg.setdefault('pretrained_dims', self.vocab.vectors.data.shape[1])
         tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
         serialize = OrderedDict((
             ('vocab', lambda p: self.vocab.to_disk(p)),
@@ -610,6 +621,9 @@ class Tagger(Pipe):

     def from_disk(self, path, **exclude):
         def load_model(p):
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
                 self.model = self.Model(self.vocab.morphology.n_tags, **self.cfg)
             with p.open('rb') as file_:
@@ -659,8 +673,6 @@ class MultitaskObjective(Tagger):
                              "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
-        self.cfg.setdefault('pretrained_dims',
-                            self.vocab.vectors.data.shape[1])

     @property
     def labels(self):
@@ -898,13 +910,15 @@ class TextCategorizer(Pipe):
         self.labels.append(label)
         return 1

-    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None):
+    def begin_training(self, gold_tuples=tuple(), pipeline=None, sgd=None,
+                       **kwargs):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
         else:
             token_vector_width = 64

         if self.model is True:
-            self.cfg['pretrained_dims'] = self.vocab.vectors_length
+            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
             self.model = self.Model(len(self.labels), token_vector_width,
                                     **self.cfg)
             link_vectors_to_models(self.vocab)
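Each pipe's loader now carries a small backwards-compatibility shim: configs written by older spaCy versions only contain `pretrained_dims`, so on load the pipe translates that into the new `pretrained_vectors` setting using the (now named) vectors table on its vocab. The same pattern on a plain dict (values are hypothetical):

# Sketch of the shim repeated in Pipe.from_bytes()/from_disk() above.
cfg = {'pretrained_dims': 300}           # written by an older spaCy version
vectors_name = 'en_core_web_md.vectors'  # stands in for self.vocab.vectors.name
if 'pretrained_dims' in cfg and 'pretrained_vectors' not in cfg:
    cfg['pretrained_vectors'] = vectors_name
assert cfg['pretrained_vectors'] == 'en_core_web_md.vectors'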
@@ -255,8 +255,9 @@ cdef class Parser:
             raise ValueError("Currently history size is hard-coded to 0")
         if hist_width != 0:
             raise ValueError("Currently history width is hard-coded to 0")
+        pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
-                          pretrained_dims=cfg.get('pretrained_dims', 0))
+                          pretrained_vectors=pretrained_vectors)
         tok2vec = chain(tok2vec, flatten)
         lower = PrecomputableAffine(hidden_width,
                     nF=cls.nr_feature, nI=token_vector_width,
@@ -275,6 +276,7 @@ cdef class Parser:
             'token_vector_width': token_vector_width,
             'hidden_width': hidden_width,
             'maxout_pieces': parser_maxout_pieces,
+            'pretrained_vectors': pretrained_vectors,
             'hist_size': hist_size,
             'hist_width': hist_width
         }
@@ -294,9 +296,9 @@ cdef class Parser:
             unless True (default), in which case a new instance is created with
             `Parser.Moves()`.
         model (object): Defines how the parse-state is created, updated and
-            evaluated. The value is set to the .model attribute unless True
-            (default), in which case a new instance is created with
-            `Parser.Model()`.
+            evaluated. The value is set to the .model attribute. If set to True
+            (default), a new instance will be created with `Parser.Model()`
+            in parser.begin_training(), parser.from_disk() or parser.from_bytes().
         **cfg: Arbitrary configuration parameters. Set to the `.cfg` attribute
         """
         self.vocab = vocab
@@ -308,8 +310,6 @@ cdef class Parser:
             cfg['beam_width'] = util.env_opt('beam_width', 1)
         if 'beam_density' not in cfg:
             cfg['beam_density'] = util.env_opt('beam_density', 0.0)
-        if 'pretrained_dims' not in cfg:
-            cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
         cfg.setdefault('cnn_maxout_pieces', 3)
         self.cfg = cfg
         if 'actions' in self.cfg:
@@ -832,7 +832,6 @@ cdef class Parser:
                 self.moves.add_action(action, label)
         cfg.setdefault('token_vector_width', 128)
         if self.model is True:
-            cfg['pretrained_dims'] = self.vocab.vectors_length
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
             if sgd is None:
                 sgd = self.create_optimizer()
@@ -896,9 +895,11 @@ cdef class Parser:
         }
         util.from_disk(path, deserializers, exclude)
         if 'model' not in exclude:
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             path = util.ensure_path(path)
             if self.model is True:
-                self.cfg.setdefault('pretrained_dims', self.vocab.vectors_length)
                 self.model, cfg = self.Model(**self.cfg)
             else:
                 cfg = {}
@@ -941,12 +942,13 @@ cdef class Parser:
         ))
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if 'model' not in exclude:
+            # TODO: Remove this once we don't have to handle previous models
+            if 'pretrained_dims' in self.cfg and 'pretrained_vectors' not in self.cfg:
+                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
             if self.model is True:
                 self.model, cfg = self.Model(**self.cfg)
-                cfg['pretrained_dims'] = self.vocab.vectors_length
             else:
                 cfg = {}
-            cfg['pretrained_dims'] = self.vocab.vectors_length
             if 'tok2vec_model' in msg:
                 self.model[0].from_bytes(msg['tok2vec_model'])
             if 'lower_model' in msg:
@@ -19,7 +19,9 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
-           'xx': ['xx_ent_web_md']}
+           'xx': ['xx_ent_web_md'],
+           'en_core_web_md': ['en_core_web_md'],
+           'es_core_news_md': ['es_core_news_md']}


 # only used for tests that require loading the models
@@ -183,6 +185,9 @@ def pytest_addoption(parser):

     for lang in _languages + ['all']:
         parser.addoption("--%s" % lang, action="store_true", help="Use %s models" % lang)
+    for model in _models:
+        if model not in _languages:
+            parser.addoption("--%s" % model, action="store_true", help="Use %s model" % model)


 def pytest_runtest_setup(item):
spacy/tests/regression/test_issue1660.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+from __future__ import unicode_literals
+import pytest
+from ...util import load_model
+
+@pytest.mark.models("en_core_web_md")
+@pytest.mark.models("es_core_news_md")
+def test_models_with_different_vectors():
+    nlp = load_model('en_core_web_md')
+    doc = nlp(u'hello world')
+    nlp2 = load_model('es_core_news_md')
+    doc2 = nlp2(u'hola')
+    doc = nlp(u'hello world')
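The regression test above only checks that both pipelines can be loaded and run in the same process. A slightly stronger check in the same spirit (a hypothetical follow-up, not part of this commit; it requires the en_core_web_md and es_core_news_md packages to be installed):

import pytest
from spacy.util import load_model


@pytest.mark.models("en_core_web_md")
@pytest.mark.models("es_core_news_md")
def test_vectors_stay_distinct():
    nlp_en = load_model('en_core_web_md')
    nlp_es = load_model('es_core_news_md')
    # Each pipeline should keep its own named vectors table.
    assert nlp_en.vocab.vectors.name != nlp_es.vocab.vectors.name
    # The English vectors must still be usable after the Spanish model loads.
    assert nlp_en(u'apple')[0].has_vector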
@@ -17,6 +17,7 @@ def meta_data():
         'email': 'email-in-fixture',
         'url': 'url-in-fixture',
         'license': 'license-in-fixture',
+        'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None}
     }

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+import functools
 import numpy
 from collections import OrderedDict
 import msgpack
@@ -19,6 +20,20 @@ def unpickle_vectors(bytes_data):
     return Vectors().from_bytes(bytes_data)


+class GlobalRegistry(object):
+    '''Global store of vectors, to avoid repeatedly loading the data.'''
+    data = {}
+
+    @classmethod
+    def register(cls, name, data):
+        cls.data[name] = data
+        return functools.partial(cls.get, name)
+
+    @classmethod
+    def get(cls, name):
+        return cls.data[name]
+
+
 cdef class Vectors:
     """Store, save and load word vectors.

@@ -31,18 +46,21 @@ cdef class Vectors:
     the table need to be assigned --- so len(list(vectors.keys())) may be
     greater or smaller than vectors.shape[0].
     """
+    cdef public object name
     cdef public object data
     cdef public object key2row
     cdef public object _unset

-    def __init__(self, *, shape=None, data=None, keys=None):
+    def __init__(self, *, shape=None, data=None, keys=None, name=None):
         """Create a new vector store.

         shape (tuple): Size of the table, as (# entries, # columns)
         data (numpy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
+        name (string): A name to identify the vectors table.
         RETURNS (Vectors): The newly created object.
         """
+        self.name = name
         if data is None:
             if shape is None:
                 shape = (0,0)
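The `GlobalRegistry` added above gives vector data a process-wide home keyed by name; `register()` stores the array and hands back a lightweight getter. Nothing in the hunks shown here calls it yet, so the following usage is purely a hypothetical sketch:

import numpy
from spacy.vectors import GlobalRegistry

data = numpy.zeros((1000, 300), dtype='f')
get_vectors = GlobalRegistry.register('en_core_web_md.vectors', data)

# register() returns functools.partial(GlobalRegistry.get, name), so the
# getter and a direct lookup both return the same array object.
assert get_vectors() is data
assert GlobalRegistry.get('en_core_web_md.vectors') is data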
@@ -381,6 +381,7 @@ cdef class Vocab:
             self.lexemes_from_bytes(file_.read())
         if self.vectors is not None:
             self.vectors.from_disk(path, exclude='strings.json')
+        if self.vectors.name is not None:
             link_vectors_to_models(self)
         return self

@@ -421,6 +422,8 @@ cdef class Vocab:
             ('vectors', lambda b: serialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
+        if self.vectors.name is not None:
+            link_vectors_to_models(self)
         return self

     def lexemes_to_bytes(self):