mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Add FeatureExtractor from Thinc (#6170)
* move featureextractor from Thinc * Update website/docs/api/architectures.md Co-authored-by: Ines Montani <ines@ines.io> * Update website/docs/api/architectures.md Co-authored-by: Ines Montani <ines@ines.io> Co-authored-by: Ines Montani <ines@ines.io>
This commit is contained in:
		
							parent
							
								
									73538782a0
								
							
						
					
					
						commit
						a22215f427
					
				
							
								
								
									
										25
									
								
								spacy/ml/featureextractor.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								spacy/ml/featureextractor.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,25 @@
 | 
				
			||||||
 | 
					from typing import List, Union, Callable, Tuple
 | 
				
			||||||
 | 
					from thinc.types import Ints2d, Doc
 | 
				
			||||||
 | 
					from thinc.api import Model, registry
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@registry.layers("spacy.FeatureExtractor.v1")
 | 
				
			||||||
 | 
					def FeatureExtractor(columns: List[Union[int, str]]) -> Model[List[Doc], List[Ints2d]]:
 | 
				
			||||||
 | 
					    return Model("extract_features", forward, attrs={"columns": columns})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def forward(model: Model[List[Doc], List[Ints2d]], docs, is_train: bool) -> Tuple[List[Ints2d], Callable]:
 | 
				
			||||||
 | 
					    columns = model.attrs["columns"]
 | 
				
			||||||
 | 
					    features: List[Ints2d] = []
 | 
				
			||||||
 | 
					    for doc in docs:
 | 
				
			||||||
 | 
					        if hasattr(doc, "to_array"):
 | 
				
			||||||
 | 
					            attrs = doc.to_array(columns)
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            attrs = doc.doc.to_array(columns)[doc.start : doc.end]
 | 
				
			||||||
 | 
					        if attrs.ndim == 1:
 | 
				
			||||||
 | 
					            attrs = attrs.reshape((attrs.shape[0], 1))
 | 
				
			||||||
 | 
					        features.append(model.ops.asarray2i(attrs, dtype="uint64"))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    backprop: Callable[[List[Ints2d]], List] = lambda d_features: []
 | 
				
			||||||
 | 
					    return features, backprop
 | 
				
			||||||
| 
						 | 
					@ -3,12 +3,13 @@ from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 | 
				
			||||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 | 
					from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 | 
				
			||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 | 
					from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
 | 
				
			||||||
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
 | 
					from thinc.api import HashEmbed, with_array, with_cpu, uniqued
 | 
				
			||||||
from thinc.api import Relu, residual, expand_window, FeatureExtractor
 | 
					from thinc.api import Relu, residual, expand_window
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 | 
					from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 | 
				
			||||||
from ...util import registry
 | 
					from ...util import registry
 | 
				
			||||||
from ..extract_ngrams import extract_ngrams
 | 
					from ..extract_ngrams import extract_ngrams
 | 
				
			||||||
from ..staticvectors import StaticVectors
 | 
					from ..staticvectors import StaticVectors
 | 
				
			||||||
 | 
					from ..featureextractor import FeatureExtractor
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.architectures.register("spacy.TextCatCNN.v1")
 | 
					@registry.architectures.register("spacy.TextCatCNN.v1")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,14 +1,14 @@
 | 
				
			||||||
from typing import Optional, List
 | 
					from typing import Optional, List
 | 
				
			||||||
from thinc.api import chain, clone, concatenate, with_array, with_padded
 | 
					 | 
				
			||||||
from thinc.api import Model, noop, list2ragged, ragged2list
 | 
					 | 
				
			||||||
from thinc.api import FeatureExtractor, HashEmbed
 | 
					 | 
				
			||||||
from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 | 
					 | 
				
			||||||
from thinc.types import Floats2d
 | 
					from thinc.types import Floats2d
 | 
				
			||||||
 | 
					from thinc.api import chain, clone, concatenate, with_array, with_padded
 | 
				
			||||||
 | 
					from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
 | 
				
			||||||
 | 
					from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from ...tokens import Doc
 | 
					from ...tokens import Doc
 | 
				
			||||||
from ...util import registry
 | 
					from ...util import registry
 | 
				
			||||||
from ...ml import _character_embed
 | 
					from ...ml import _character_embed
 | 
				
			||||||
from ..staticvectors import StaticVectors
 | 
					from ..staticvectors import StaticVectors
 | 
				
			||||||
 | 
					from ..featureextractor import FeatureExtractor
 | 
				
			||||||
from ...pipeline.tok2vec import Tok2VecListener
 | 
					from ...pipeline.tok2vec import Tok2VecListener
 | 
				
			||||||
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
 | 
					from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -144,9 +144,9 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 | 
				
			||||||
Construct an embedding layer that separately embeds a number of lexical
 | 
					Construct an embedding layer that separately embeds a number of lexical
 | 
				
			||||||
attributes using hash embedding, concatenates the results, and passes it through
 | 
					attributes using hash embedding, concatenates the results, and passes it through
 | 
				
			||||||
a feed-forward subnetwork to build mixed representations. The features used are
 | 
					a feed-forward subnetwork to build mixed representations. The features used are
 | 
				
			||||||
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
 | 
					the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE` attributes, which are extracted with a
 | 
				
			||||||
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
 | 
					[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
 | 
				
			||||||
static vectors can also be incorporated into the concatenated representation.
 | 
					vectors can also be incorporated into the concatenated representation.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                      | Description                                                                                                                                                                                                       |
 | 
					| Name                      | Description                                                                                                                                                                                                       |
 | 
				
			||||||
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
				
			||||||
| 
						 | 
					@ -291,6 +291,24 @@ on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
 | 
				
			||||||
| `key_attr`  | Defaults to `"ORTH"`. ~~str~~                                                                                                                                                                                           |
 | 
					| `key_attr`  | Defaults to `"ORTH"`. ~~str~~                                                                                                                                                                                           |
 | 
				
			||||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~                                                                                                                                                          |
 | 
					| **CREATES** | The model using the architecture. ~~Model[List[Doc], Ragged]~~                                                                                                                                                          |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### spacy.FeatureExtractor.v1 {#FeatureExtractor}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					> #### Example config
 | 
				
			||||||
 | 
					>
 | 
				
			||||||
 | 
					> ```ini
 | 
				
			||||||
 | 
					> [model]
 | 
				
			||||||
 | 
					> @layers = "spacy.FeatureExtractor.v1"
 | 
				
			||||||
 | 
					> columns = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
 | 
				
			||||||
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
 | 
				
			||||||
 | 
					of feature names to extract, which should refer to token attributes.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Name        | Description                                                              |
 | 
				
			||||||
 | 
					| ----------- | ------------------------------------------------------------------------ |
 | 
				
			||||||
 | 
					| `columns`   | The token attributes to extract. ~~List[Union[int, str]]~~               |
 | 
				
			||||||
 | 
					| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
 | 
					## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
The following architectures are provided by the package
 | 
					The following architectures are provided by the package
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -585,8 +585,9 @@ vectors, but combines them via summation with a smaller table of learned
 | 
				
			||||||
embeddings.
 | 
					embeddings.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```python
 | 
					```python
 | 
				
			||||||
from thinc.api import add, chain, remap_ids, Embed, FeatureExtractor
 | 
					from thinc.api import add, chain, remap_ids, Embed
 | 
				
			||||||
from spacy.ml.staticvectors import StaticVectors
 | 
					from spacy.ml.staticvectors import StaticVectors
 | 
				
			||||||
 | 
					from spacy.ml.featureextractor import FeatureExtractor
 | 
				
			||||||
from spacy.util import registry
 | 
					from spacy.util import registry
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@registry.architectures("my_example.MyEmbedding.v1")
 | 
					@registry.architectures("my_example.MyEmbedding.v1")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user