mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 00:04:15 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
c27fdaef6f
|
@ -17,9 +17,11 @@
|
|||
"Span": "span",
|
||||
"Language": "language",
|
||||
"Tokenizer": "tokenizer",
|
||||
"Tensorizer": "tensorizer",
|
||||
"Tagger": "tagger",
|
||||
"DependencyParser": "dependencyparser",
|
||||
"EntityRecognizer": "entityrecognizer",
|
||||
"TextCategorizer": "textcategorizer",
|
||||
"Matcher": "matcher",
|
||||
"Lexeme": "lexeme",
|
||||
"Vocab": "vocab",
|
||||
|
@ -129,6 +131,12 @@
|
|||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"textcategorizer": {
|
||||
"title": "TextCategorizer",
|
||||
"tag": "class",
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"dependencyparser": {
|
||||
"title": "DependencyParser",
|
||||
"tag": "class",
|
||||
|
@ -147,6 +155,12 @@
|
|||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"tensorizer": {
|
||||
"title": "Tensorizer",
|
||||
"tag": "class",
|
||||
"source": "spacy/pipeline.pyx"
|
||||
},
|
||||
|
||||
"goldparse": {
|
||||
"title": "GoldParse",
|
||||
"tag": "class",
|
||||
|
|
|
@ -40,7 +40,7 @@ p
|
|||
+h(2, "pos-tagging") Part-of-speech Tagging
|
||||
|
||||
+aside("Tip: Understanding tags")
|
||||
| You can also use #[code spacy.explain()] to get the escription for the
|
||||
| You can also use #[code spacy.explain()] to get the description for the
|
||||
| string representation of a tag. For example,
|
||||
| #[code spacy.explain("RB")] will return "adverb".
|
||||
|
||||
|
|
|
@ -558,10 +558,20 @@ p
|
|||
+cell The store of lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tensor]
|
||||
+cell #[code tensor] #[+tag-new(2)]
|
||||
+cell object
|
||||
+cell Container for dense vector representations.
|
||||
|
||||
+row
|
||||
+cell #[code cats] #[+tag-new(2)]
|
||||
+cell dictionary
|
||||
+cell
|
||||
| Maps either a label to a score for categories applied to the whole
|
||||
| document, or #[code (start_char, end_char, label)] to a score for
|
||||
| categories applied to spans. #[code start_char] and #[code end_char]
|
||||
| should be character offsets, label can be either a string or an
|
||||
| integer ID, and score should be a float.
|
||||
|
||||
+row
|
||||
+cell #[code user_data]
|
||||
+cell -
|
||||
|
|
|
@ -103,6 +103,14 @@ p
|
|||
+cell list
|
||||
+cell The alignment from gold tokenization to candidate tokenization.
|
||||
|
||||
+row
|
||||
+cell #[code cats] #[+tag-new(2)]
|
||||
+cell list
|
||||
+cell
|
||||
| Entries in the list should be either a label, or a
|
||||
| #[code (start, end, label)] triple. The tuple form is used for
|
||||
| categories applied to spans of the document.
|
||||
|
||||
|
||||
+h(2, "util") Utilities
|
||||
|
||||
|
|
7
website/docs/api/tensorizer.jade
Normal file
7
website/docs/api/tensorizer.jade
Normal file
|
@ -0,0 +1,7 @@
|
|||
//- 💫 DOCS > API > TENSORIZER
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p Add a tensor with position-sensitive meaning representations to a #[code Doc].
|
||||
|
||||
+under-construction
|
21
website/docs/api/textcategorizer.jade
Normal file
21
website/docs/api/textcategorizer.jade
Normal file
|
@ -0,0 +1,21 @@
|
|||
//- 💫 DOCS > API > TEXTCATEGORIZER
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| Add text categorization models to spaCy pipelines. The model supports
|
||||
| classification with multiple, non-mutually exclusive labels.
|
||||
|
||||
p
|
||||
| You can change the model architecture rather easily, but by default, the
|
||||
| #[code TextCategorizer] class uses a convolutional neural network to
|
||||
| assign position-sensitive vectors to each word in the document. This step
|
||||
| is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the
|
||||
| #[code TextCategorizer] uses its own CNN model, to avoid sharing weights
|
||||
| with the other pipeline components. The document tensor is then
|
||||
| summarized by concatenating max and mean pooling, and a multilayer
|
||||
| perceptron is used to predict an output vector of length #[code nr_class],
|
||||
| before a logistic activation is applied elementwise. The value of each
|
||||
| output neuron is the probability that some class is present.
|
||||
|
||||
+under-construction
|
|
@ -16,6 +16,7 @@
|
|||
"Rule-based matching": "rule-based-matching",
|
||||
"Adding languages": "adding-languages",
|
||||
"Processing pipelines": "language-processing-pipeline",
|
||||
"Text classification": "text-classification",
|
||||
"Deep learning": "deep-learning",
|
||||
"Production use": "production-use",
|
||||
"Training": "training",
|
||||
|
@ -106,6 +107,11 @@
|
|||
"next": "production use"
|
||||
},
|
||||
|
||||
"text-classification": {
|
||||
"title": "Text classification",
|
||||
"next": "training"
|
||||
},
|
||||
|
||||
"production-use": {
|
||||
"title": "Production use",
|
||||
"next": "training"
|
||||
|
|
|
@ -129,13 +129,6 @@ p
|
|||
| locations.
|
||||
+cell #[+procon("pro")]
|
||||
|
||||
+row
|
||||
+cell #[strong Rule-based Matching]
|
||||
+cell
|
||||
| Finding sequences of tokens based on their texts and linguistic
|
||||
| annotations, similar to regular expressions.
|
||||
+cell #[+procon("con")]
|
||||
|
||||
+row
|
||||
+cell #[strong Similarity]
|
||||
+cell
|
||||
|
@ -143,6 +136,18 @@ p
|
|||
| are to each other.
|
||||
+cell #[+procon("pro")]
|
||||
|
||||
+row
|
||||
+cell #[strong Text classification]
|
||||
+cell Assigning categories or labels to a whole document, or parts of a document.
|
||||
+cell #[+procon("pro")]
|
||||
|
||||
+row
|
||||
+cell #[strong Rule-based Matching]
|
||||
+cell
|
||||
| Finding sequences of tokens based on their texts and linguistic
|
||||
| annotations, similar to regular expressions.
|
||||
+cell #[+procon("con")]
|
||||
|
||||
+row
|
||||
+cell #[strong Training]
|
||||
+cell Updating and improving a statistical model's predictions.
|
||||
|
|
5
website/docs/usage/text-classification.jade
Normal file
5
website/docs/usage/text-classification.jade
Normal file
|
@ -0,0 +1,5 @@
|
|||
//- 💫 DOCS > USAGE > TEXT CLASSIFICATION
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
+under-construction
|
|
@ -38,6 +38,7 @@ p
|
|||
+item #[+a("#summary") Summary]
|
||||
+item #[+a("#features") New features]
|
||||
+item #[+a("#features-pipelines") Improved processing pipelines]
|
||||
+item #[+a("#features-text-classification") Text classification]
|
||||
+item #[+a("#features-hash-ids") Hash values instead of integer IDs]
|
||||
+item #[+a("#features-serializer") Saving, loading and serialization]
|
||||
+item #[+a("#features-displacy") displaCy visualizer]
|
||||
|
@ -102,6 +103,26 @@ p
|
|||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
||||
|
||||
+h(3, "features-text-classification") Text classification
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.lang.en import English
|
||||
nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
|
||||
|
||||
p
|
||||
| spaCy v2.0 lets you add text categorization models to spaCy pipelines.
|
||||
| The model supports classification with multiple, non-mutually exclusive
|
||||
| labels – so multiple labels can apply at once. You can change the model
|
||||
| architecture rather easily, but by default, the #[code TextCategorizer]
|
||||
| class uses a convolutional neural network to assign position-sensitive
|
||||
| vectors to each word in the document.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]],
|
||||
| #[+api("doc#attributes") #[code Doc.cats]],
|
||||
| #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
|
||||
| #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification]
|
||||
|
||||
+h(3, "features-hash-ids") Hash values instead of integer IDs
|
||||
|
||||
+aside-code("Example").
|
||||
|
|
Loading…
Reference in New Issue
Block a user