Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-07-22 20:15:55 +02:00
commit c27fdaef6f
10 changed files with 106 additions and 9 deletions

View File

@ -17,9 +17,11 @@
"Span": "span",
"Language": "language",
"Tokenizer": "tokenizer",
"Tensorizer": "tensorizer",
"Tagger": "tagger",
"DependencyParser": "dependencyparser",
"EntityRecognizer": "entityrecognizer",
"TextCategorizer": "textcategorizer",
"Matcher": "matcher",
"Lexeme": "lexeme",
"Vocab": "vocab",
@ -129,6 +131,12 @@
"source": "spacy/pipeline.pyx"
},
"textcategorizer": {
"title": "TextCategorizer",
"tag": "class",
"source": "spacy/pipeline.pyx"
},
"dependencyparser": {
"title": "DependencyParser",
"tag": "class",
@ -147,6 +155,12 @@
"source": "spacy/pipeline.pyx"
},
"tensorizer": {
"title": "Tensorizer",
"tag": "class",
"source": "spacy/pipeline.pyx"
},
"goldparse": {
"title": "GoldParse",
"tag": "class",

View File

@ -40,7 +40,7 @@ p
+h(2, "pos-tagging") Part-of-speech Tagging
+aside("Tip: Understanding tags")
| You can also use #[code spacy.explain()] to get the escription for the
| You can also use #[code spacy.explain()] to get the description for the
| string representation of a tag. For example,
| #[code spacy.explain("RB")] will return "adverb".

View File

@ -558,10 +558,20 @@ p
+cell The store of lexical types.
+row
+cell #[code tensor]
+cell #[code tensor] #[+tag-new(2)]
+cell object
+cell Container for dense vector representations.
+row
+cell #[code cats] #[+tag-new(2)]
+cell dictionary
+cell
| Maps either a label to a score for categories applied to the whole
| document, or #[code (start_char, end_char, label)] to a score for
| categories applied to spans. #[code start_char] and #[code end_char]
| should be character offsets, label can be either a string or an
| integer ID, and the score should be a float.
+row
+cell #[code user_data]
+cell -

View File

@ -103,6 +103,14 @@ p
+cell list
+cell The alignment from gold tokenization to candidate tokenization.
+row
+cell #[code cats] #[+tag-new(2)]
+cell list
+cell
| Entries in the list should be either a label, or a
| #[code (start, end, label)] triple. The tuple form is used for
| categories applied to spans of the document.
+h(2, "util") Utilities

View File

@ -0,0 +1,7 @@
//- 💫 DOCS > API > TENSORIZER
include ../../_includes/_mixins
p Add a tensor with position-sensitive meaning representations to a #[code Doc].
+under-construction

View File

@ -0,0 +1,21 @@
//- 💫 DOCS > API > TEXTCATEGORIZER
include ../../_includes/_mixins
p
| Add text categorization models to spaCy pipelines. The model supports
| classification with multiple, non-mutually exclusive labels.
p
| You can change the model architecture rather easily, but by default, the
| #[code TextCategorizer] class uses a convolutional neural network to
| assign position-sensitive vectors to each word in the document. This step
| is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the
| #[code TextCategorizer] uses its own CNN model, to avoid sharing weights
| with the other pipeline components. The document tensor is then
| summarized by concatenating max and mean pooling, and a multilayer
| perceptron is used to predict an output vector of length #[code nr_class],
| before a logistic activation is applied elementwise. The value of each
| output neuron is the probability that some class is present.
+under-construction

View File

@ -16,6 +16,7 @@
"Rule-based matching": "rule-based-matching",
"Adding languages": "adding-languages",
"Processing pipelines": "language-processing-pipeline",
"Text classification": "text-classification",
"Deep learning": "deep-learning",
"Production use": "production-use",
"Training": "training",
@ -106,6 +107,11 @@
"next": "production-use"
},
"text-classification": {
"title": "Text classification",
"next": "training"
},
"production-use": {
"title": "Production use",
"next": "training"

View File

@ -129,13 +129,6 @@ p
| locations.
+cell #[+procon("pro")]
+row
+cell #[strong Rule-based Matching]
+cell
| Finding sequences of tokens based on their texts and linguistic
| annotations, similar to regular expressions.
+cell #[+procon("con")]
+row
+cell #[strong Similarity]
+cell
@ -143,6 +136,18 @@ p
| are to each other.
+cell #[+procon("pro")]
+row
+cell #[strong Text classification]
+cell Assigning categories or labels to a whole document, or parts of a document.
+cell #[+procon("pro")]
+row
+cell #[strong Rule-based Matching]
+cell
| Finding sequences of tokens based on their texts and linguistic
| annotations, similar to regular expressions.
+cell #[+procon("con")]
+row
+cell #[strong Training]
+cell Updating and improving a statistical model's predictions.

View File

@ -0,0 +1,5 @@
//- 💫 DOCS > USAGE > TEXT CLASSIFICATION
include ../../_includes/_mixins
+under-construction

View File

@ -38,6 +38,7 @@ p
+item #[+a("#summary") Summary]
+item #[+a("#features") New features]
+item #[+a("#features-pipelines") Improved processing pipelines]
+item #[+a("#features-text-classification") Text classification]
+item #[+a("#features-hash-ids") Hash values instead of integer IDs]
+item #[+a("#features-serializer") Saving, loading and serialization]
+item #[+a("#features-displacy") displaCy visualizer]
@ -102,6 +103,26 @@ p
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
+h(3, "features-text-classification") Text classification
+aside-code("Example").
from spacy.lang.en import English
nlp = English(pipeline=['tensorizer', 'tagger', 'textcat'])
p
| spaCy v2.0 lets you add text categorization models to spaCy pipelines.
| The model supports classification with multiple, non-mutually exclusive
| labels, so multiple labels can apply at once. You can change the model
| architecture rather easily, but by default, the #[code TextCategorizer]
| class uses a convolutional neural network to assign position-sensitive
| vectors to each word in the document.
+infobox
| #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]],
| #[+api("doc#attributes") #[code Doc.cats]],
| #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br]
| #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification]
+h(3, "features-hash-ids") Hash values instead of integer IDs
+aside-code("Example").