mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						c27fdaef6f
					
				|  | @ -17,9 +17,11 @@ | |||
|             "Span": "span", | ||||
|             "Language": "language", | ||||
|             "Tokenizer": "tokenizer", | ||||
|             "Tensorizer": "tensorizer", | ||||
|             "Tagger": "tagger", | ||||
|             "DependencyParser": "dependencyparser", | ||||
|             "EntityRecognizer": "entityrecognizer", | ||||
|             "TextCategorizer": "textcategorizer", | ||||
|             "Matcher": "matcher", | ||||
|             "Lexeme": "lexeme", | ||||
|             "Vocab": "vocab", | ||||
|  | @ -129,6 +131,12 @@ | |||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "textcategorizer": { | ||||
|         "title": "TextCategorizer", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "dependencyparser": { | ||||
|         "title": "DependencyParser", | ||||
|         "tag": "class", | ||||
|  | @ -147,6 +155,12 @@ | |||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "tensorizer": { | ||||
|         "title": "Tensorizer", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "goldparse": { | ||||
|         "title": "GoldParse", | ||||
|         "tag": "class", | ||||
|  |  | |||
|  | @ -40,7 +40,7 @@ p | |||
| +h(2, "pos-tagging") Part-of-speech Tagging | ||||
| 
 | ||||
| +aside("Tip: Understanding tags") | ||||
|     |  You can also use #[code spacy.explain()] to get the description for the | ||||
|     |  You can also use #[code spacy.explain()] to get the description for the | ||||
|     |  string representation of a tag. For example, | ||||
|     |  #[code spacy.explain("RB")] will return "adverb". | ||||
| 
 | ||||
|  |  | |||
|  | @ -558,10 +558,20 @@ p | |||
|         +cell The store of lexical types. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code tensor] | ||||
|         +cell #[code tensor] #[+tag-new(2)] | ||||
|         +cell object | ||||
|         +cell Container for dense vector representations. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code cats] #[+tag-new(2)] | ||||
|         +cell dictionary | ||||
|         +cell | ||||
|             |  Maps either a label to a score for categories applied to the whole | ||||
|             |  document, or #[code (start_char, end_char, label)] to a score for | ||||
|             |  categories applied to spans. #[code start_char] and #[code end_char] | ||||
|             |  should be character offsets, label can be either a string or an | ||||
|             |  integer ID, and score should be a float. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_data] | ||||
|         +cell - | ||||
|  |  | |||
|  | @ -103,6 +103,14 @@ p | |||
|         +cell list | ||||
|         +cell The alignment from gold tokenization to candidate tokenization. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code cats] #[+tag-new(2)] | ||||
|         +cell list | ||||
|         +cell | ||||
|             |  Entries in the list should be either a label, or a | ||||
|             |  #[code (start, end, label)] triple. The tuple form is used for | ||||
|             |  categories applied to spans of the document. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "util") Utilities | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										7
									
								
								website/docs/api/tensorizer.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								website/docs/api/tensorizer.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,7 @@ | |||
| //- 💫 DOCS > API > TENSORIZER | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| p Add a tensor with position-sensitive meaning representations to a #[code Doc]. | ||||
| 
 | ||||
| +under-construction | ||||
							
								
								
									
										21
									
								
								website/docs/api/textcategorizer.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								website/docs/api/textcategorizer.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,21 @@ | |||
| //- 💫 DOCS > API > TEXTCATEGORIZER | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| p | ||||
|     |  Add text categorization models to spaCy pipelines. The model supports | ||||
|     |  classification with multiple, non-mutually exclusive labels. | ||||
| 
 | ||||
| p | ||||
|     |  You can change the model architecture rather easily, but by default, the | ||||
|     |  #[code TextCategorizer] class uses a convolutional neural network to | ||||
|     |  assign position-sensitive vectors to each word in the document. This step | ||||
|     |  is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the | ||||
|     |  #[code TextCategorizer] uses its own CNN model, to avoid sharing weights | ||||
|     |  with the other pipeline components. The document tensor is then | ||||
|     |  summarized by concatenating max and mean pooling, and a multilayer | ||||
|     |  perceptron is used to predict an output vector of length #[code nr_class], | ||||
|     |  before a logistic activation is applied elementwise. The value of each | ||||
|     |  output neuron is the probability that some class is present. | ||||
| 
 | ||||
| +under-construction | ||||
|  | @ -16,6 +16,7 @@ | |||
|             "Rule-based matching": "rule-based-matching", | ||||
|             "Adding languages": "adding-languages", | ||||
|             "Processing pipelines": "language-processing-pipeline", | ||||
|             "Text classification": "text-classification", | ||||
|             "Deep learning": "deep-learning", | ||||
|             "Production use": "production-use", | ||||
|             "Training": "training", | ||||
|  | @ -106,6 +107,11 @@ | |||
|         "next": "production use" | ||||
|     }, | ||||
| 
 | ||||
|     "text-classification": { | ||||
|         "title": "Text classification", | ||||
|         "next": "training" | ||||
|     }, | ||||
| 
 | ||||
|     "production-use": { | ||||
|         "title": "Production use", | ||||
|         "next": "training" | ||||
|  |  | |||
|  | @ -129,13 +129,6 @@ p | |||
|             |  locations. | ||||
|         +cell #[+procon("pro")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Rule-based Matching] | ||||
|         +cell | ||||
|             |  Finding sequences of tokens based on their texts and linguistic | ||||
|             |  annotations, similar to regular expressions. | ||||
|         +cell #[+procon("con")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Similarity] | ||||
|         +cell | ||||
|  | @ -143,6 +136,18 @@ p | |||
|             |  are to each other. | ||||
|         +cell #[+procon("pro")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Text classification] | ||||
|         +cell Assigning categories or labels to a whole document, or parts of a document. | ||||
|         +cell #[+procon("pro")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Rule-based Matching] | ||||
|         +cell | ||||
|             |  Finding sequences of tokens based on their texts and linguistic | ||||
|             |  annotations, similar to regular expressions. | ||||
|         +cell #[+procon("con")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Training] | ||||
|         +cell Updating and improving a statistical model's predictions. | ||||
|  |  | |||
							
								
								
									
										5
									
								
								website/docs/usage/text-classification.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								website/docs/usage/text-classification.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,5 @@ | |||
| //- 💫 DOCS > USAGE > TEXT CLASSIFICATION | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| +under-construction | ||||
|  | @ -38,6 +38,7 @@ p | |||
|         +item #[+a("#summary") Summary] | ||||
|         +item #[+a("#features") New features] | ||||
|         +item #[+a("#features-pipelines") Improved processing pipelines] | ||||
|         +item #[+a("#features-text-classification") Text classification] | ||||
|         +item #[+a("#features-hash-ids") Hash values instead of integer IDs] | ||||
|         +item #[+a("#features-serializer") Saving, loading and serialization] | ||||
|         +item #[+a("#features-displacy") displaCy visualizer] | ||||
|  | @ -102,6 +103,26 @@ p | |||
|     |  #[strong API:] #[+api("language") #[code Language]] | ||||
|     |  #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] | ||||
| 
 | ||||
| +h(3, "features-text-classification") Text classification | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.lang.en import English | ||||
|     nlp = English(pipeline=['tensorizer', 'tagger', 'textcat']) | ||||
| 
 | ||||
| p | ||||
|     |  spaCy v2.0 lets you add text categorization models to spaCy pipelines. | ||||
|     |  The model supports classification with multiple, non-mutually exclusive | ||||
|     |  labels – so multiple labels can apply at once. You can change the model | ||||
|     |  architecture rather easily, but by default, the #[code TextCategorizer] | ||||
|     |  class uses a convolutional neural network to assign position-sensitive | ||||
|     |  vectors to each word in the document. | ||||
| 
 | ||||
| +infobox | ||||
|     |  #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]], | ||||
|     |  #[+api("doc#attributes") #[code Doc.cats]], | ||||
|     |  #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br] | ||||
|     |  #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification] | ||||
| 
 | ||||
| +h(3, "features-hash-ids") Hash values instead of integer IDs | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user