mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Add vectors option to CharacterEmbed (#6069)
* Add vectors option to CharacterEmbed
* Update spacy/pipeline/morphologizer.pyx
* Adjust default morphologizer config

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
		
							parent
							
								
									d722a439aa
								
							
						
					
					
						commit
						f3db3f6fe0
					
				| 
						 | 
				
			
			@ -164,7 +164,7 @@ def MultiHashEmbed(
 | 
			
		|||
 | 
			
		||||
 | 
			
		||||
@registry.architectures.register("spacy.CharacterEmbed.v1")
 | 
			
		||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
 | 
			
		||||
def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
 | 
			
		||||
    """Construct an embedded representation based on character embeddings, using
 | 
			
		||||
    a feed-forward network. A fixed number of UTF-8 byte characters are used for
 | 
			
		||||
    each word, taken from the beginning and end of the word equally. Padding is
 | 
			
		||||
| 
						 | 
				
			
			@ -188,7 +188,24 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
 | 
			
		|||
    nC (int): The number of UTF-8 bytes to embed per word. Recommended values
 | 
			
		||||
        are between 3 and 8, although it may depend on the length of words in the
 | 
			
		||||
        language.
 | 
			
		||||
    also_use_static_vectors (bool): Whether to also use static word vectors.
 | 
			
		||||
        Requires a vectors table to be loaded in the Doc objects' vocab.
 | 
			
		||||
    """
 | 
			
		||||
    if also_use_static_vectors:
 | 
			
		||||
        model = chain(
 | 
			
		||||
            concatenate(
 | 
			
		||||
                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
 | 
			
		||||
                chain(
 | 
			
		||||
                    FeatureExtractor([NORM]),
 | 
			
		||||
                    list2ragged(),
 | 
			
		||||
                    with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
 | 
			
		||||
                ),
 | 
			
		||||
                StaticVectors(width, dropout=0.0),
 | 
			
		||||
            ),
 | 
			
		||||
            with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
 | 
			
		||||
            ragged2list(),
 | 
			
		||||
    )
 | 
			
		||||
    else:
 | 
			
		||||
        model = chain(
 | 
			
		||||
            concatenate(
 | 
			
		||||
                chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -32,6 +32,7 @@ width = 128
 | 
			
		|||
rows = 7000
 | 
			
		||||
nM = 64
 | 
			
		||||
nC = 8
 | 
			
		||||
also_use_static_vectors = false
 | 
			
		||||
 | 
			
		||||
[model.tok2vec.encode]
 | 
			
		||||
@architectures = "spacy.MaxoutWindowEncoder.v1"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
 | 
			
		|||
    [
 | 
			
		||||
        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
 | 
			
		||||
        (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
 | 
			
		||||
        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
 | 
			
		||||
        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
 | 
			
		||||
        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
 | 
			
		||||
        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
# fmt: on
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user