diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index a2e447dc8..e413f200c 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -17,9 +17,11 @@ "Span": "span", "Language": "language", "Tokenizer": "tokenizer", + "Tensorizer": "tensorizer", "Tagger": "tagger", "DependencyParser": "dependencyparser", "EntityRecognizer": "entityrecognizer", + "TextCategorizer": "textcategorizer", "Matcher": "matcher", "Lexeme": "lexeme", "Vocab": "vocab", @@ -129,6 +131,12 @@ "source": "spacy/pipeline.pyx" }, + "textcategorizer": { + "title": "TextCategorizer", + "tag": "class", + "source": "spacy/pipeline.pyx" + }, + "dependencyparser": { "title": "DependencyParser", "tag": "class", @@ -147,6 +155,12 @@ "source": "spacy/pipeline.pyx" }, + "tensorizer": { + "title": "Tensorizer", + "tag": "class", + "source": "spacy/pipeline.pyx" + }, + "goldparse": { "title": "GoldParse", "tag": "class", diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 048e69897..ce18878b7 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -40,7 +40,7 @@ p +h(2, "pos-tagging") Part-of-speech Tagging +aside("Tip: Understanding tags") - | You can also use #[code spacy.explain()] to get the escription for the + | You can also use #[code spacy.explain()] to get the description for the | string representation of a tag. For example, | #[code spacy.explain("RB")] will return "adverb". diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index f82a26c9e..929985144 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -558,10 +558,20 @@ p +cell The store of lexical types. +row - +cell #[code tensor] + +cell #[code tensor] #[+tag-new(2)] +cell object +cell Container for dense vector representations. 
+ +row + +cell #[code cats] #[+tag-new(2)] + +cell dictionary + +cell + | Maps either a label to a score for categories applied to the whole + | document, or #[code (start_char, end_char, label)] to a score for + | categories applied to spans. #[code start_char] and #[code end_char] + | should be character offsets; the label can be either a string or an + | integer ID, and the score should be a float. + +row +cell #[code user_data] +cell - diff --git a/website/docs/api/goldparse.jade b/website/docs/api/goldparse.jade index 7818912c3..03118343d 100644 --- a/website/docs/api/goldparse.jade +++ b/website/docs/api/goldparse.jade @@ -103,6 +103,14 @@ p +cell list +cell The alignment from gold tokenization to candidate tokenization. + +row + +cell #[code cats] #[+tag-new(2)] + +cell list + +cell + | Entries in the list should be either a label, or a + | #[code (start, end, label)] triple. The tuple form is used for + | categories applied to spans of the document. + +h(2, "util") Utilities diff --git a/website/docs/api/tensorizer.jade b/website/docs/api/tensorizer.jade new file mode 100644 index 000000000..9abd6793b --- /dev/null +++ b/website/docs/api/tensorizer.jade @@ -0,0 +1,7 @@ +//- 💫 DOCS > API > TENSORIZER + +include ../../_includes/_mixins + +p Add a tensor with position-sensitive meaning representations to a #[code Doc]. + ++under-construction diff --git a/website/docs/api/textcategorizer.jade b/website/docs/api/textcategorizer.jade new file mode 100644 index 000000000..926d957f7 --- /dev/null +++ b/website/docs/api/textcategorizer.jade @@ -0,0 +1,21 @@ +//- 💫 DOCS > API > TEXTCATEGORIZER + +include ../../_includes/_mixins + +p + | Add text categorization models to spaCy pipelines. The model supports + | classification with multiple, non-mutually exclusive labels. 
+ +p + | You can change the model architecture rather easily, but by default, the + | #[code TextCategorizer] class uses a convolutional neural network to + | assign position-sensitive vectors to each word in the document. This step + | is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the + | #[code TextCategorizer] uses its own CNN model, to avoid sharing weights + | with the other pipeline components. The document tensor is then + | summarized by concatenating max and mean pooling, and a multilayer + | perceptron is used to predict an output vector of length #[code nr_class], + | before a logistic activation is applied elementwise. The value of each + | output neuron is the probability that some class is present. + ++under-construction diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 81deeb402..c8373a095 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -16,6 +16,7 @@ "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", "Processing pipelines": "language-processing-pipeline", + "Text classification": "text-classification", "Deep learning": "deep-learning", "Production use": "production-use", "Training": "training", @@ -106,6 +107,11 @@ "next": "production use" }, + "text-classification": { + "title": "Text classification", + "next": "training" + }, + "production-use": { "title": "Production use", "next": "training" diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index f657ebf11..a54e5cf66 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -129,13 +129,6 @@ p | locations. +cell #[+procon("pro")] - +row - +cell #[strong Rule-based Matching] - +cell - | Finding sequences of tokens based on their texts and linguistic - | annotations, similar to regular expressions. - +cell #[+procon("con")] - +row +cell #[strong Similarity] +cell @@ -143,6 +136,18 @@ p | are to each other. 
+cell #[+procon("pro")] + +row + +cell #[strong Text classification] + +cell Assigning categories or labels to a whole document, or parts of a document. + +cell #[+procon("pro")] + + +row + +cell #[strong Rule-based Matching] + +cell + | Finding sequences of tokens based on their texts and linguistic + | annotations, similar to regular expressions. + +cell #[+procon("con")] + +row +cell #[strong Training] +cell Updating and improving a statistical model's predictions. diff --git a/website/docs/usage/text-classification.jade b/website/docs/usage/text-classification.jade new file mode 100644 index 000000000..33e384dbd --- /dev/null +++ b/website/docs/usage/text-classification.jade @@ -0,0 +1,5 @@ +//- 💫 DOCS > USAGE > TEXT CLASSIFICATION + +include ../../_includes/_mixins + ++under-construction diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index bbcfe865f..d9727c62b 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -38,6 +38,7 @@ p +item #[+a("#summary") Summary] +item #[+a("#features") New features] +item #[+a("#features-pipelines") Improved processing pipelines] + +item #[+a("#features-text-classification") Text classification] +item #[+a("#features-hash-ids") Hash values instead of integer IDs] +item #[+a("#features-serializer") Saving, loading and serialization] +item #[+a("#features-displacy") displaCy visualizer] @@ -102,6 +103,26 @@ p | #[strong API:] #[+api("language") #[code Language]] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] ++h(3, "features-text-classification") Text classification + ++aside-code("Example"). + from spacy.lang.en import English + nlp = English(pipeline=['tensorizer', 'tagger', 'textcat']) + +p + | spaCy v2.0 lets you add text categorization models to spaCy pipelines. + | The model supports classification with multiple, non-mutually exclusive + | labels – so multiple labels can apply at once. 
You can change the model + | architecture rather easily, but by default, the #[code TextCategorizer] + | class uses a convolutional neural network to assign position-sensitive + | vectors to each word in the document. + ++infobox + | #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]], + | #[+api("doc#attributes") #[code Doc.cats]], + | #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br] + | #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification] + +h(3, "features-hash-ids") Hash values instead of integer IDs +aside-code("Example").