From 1ddbeddca2df65947b900f63ccc04ce4dad152ae Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 22 Jul 2017 15:00:58 +0200 Subject: [PATCH 1/8] Fix typo --- website/docs/api/annotation.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index 048e69897..ce18878b7 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -40,7 +40,7 @@ p +h(2, "pos-tagging") Part-of-speech Tagging +aside("Tip: Understanding tags") - | You can also use #[code spacy.explain()] to get the escription for the + | You can also use #[code spacy.explain()] to get the description for the | string representation of a tag. For example, | #[code spacy.explain("RB")] will return "adverb". From 23d976ed00abb0d04ef1a35a7d42db0ef3e1942b Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:55:14 +0200 Subject: [PATCH 2/8] Add Doc.cats attribute and missing v2 tag --- website/docs/api/doc.jade | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index f82a26c9e..929985144 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -558,10 +558,20 @@ p +cell The store of lexical types. +row - +cell #[code tensor] + +cell #[code tensor] #[+tag-new(2)] +cell object +cell Container for dense vector representations. + +row + +cell #[code cats] #[+tag-new(2)] + +cell dictionary + +cell + | Maps either a label to a score for categories applied to the whole + | document, or #[code (start_char, end_char, label)] to a score for + | categories applied to spans. #[code start_char] and #[code end_char] + | should be character offsets, label can be either a string or an + | integer ID, and score should be a float. 
+ +row +cell #[code user_data] +cell - From d2a7e5b8e52ae8d90d365225ddc3eca918cf316a Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:55:35 +0200 Subject: [PATCH 3/8] Add GoldParse.cats attribute --- website/docs/api/goldparse.jade | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/api/goldparse.jade b/website/docs/api/goldparse.jade index 7818912c3..03118343d 100644 --- a/website/docs/api/goldparse.jade +++ b/website/docs/api/goldparse.jade @@ -103,6 +103,14 @@ p +cell list +cell The alignment from gold tokenization to candidate tokenization. + +row + +cell #[code cats] #[+tag-new(2)] + +cell list + +cell + | Entries in the list should be either a label, or a + | #[code (start, end, label)] triple. The tuple form is used for + | categories applied to spans of the document. + +h(2, "util") Utilities From d05ab1b3a0f7ac4e586b4a295d7be8cccd49fa2c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:55:53 +0200 Subject: [PATCH 4/8] Add text classification to 101 overview and change order --- website/docs/usage/spacy-101.jade | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/spacy-101.jade b/website/docs/usage/spacy-101.jade index f657ebf11..a54e5cf66 100644 --- a/website/docs/usage/spacy-101.jade +++ b/website/docs/usage/spacy-101.jade @@ -129,13 +129,6 @@ p | locations. +cell #[+procon("pro")] - +row - +cell #[strong Rule-based Matching] - +cell - | Finding sequences of tokens based on their texts and linguistic - | annotations, similar to regular expressions. - +cell #[+procon("con")] - +row +cell #[strong Similarity] +cell @@ -143,6 +136,18 @@ p | are to each other. +cell #[+procon("pro")] + +row + +cell #[strong Text classification] + +cell Assigning categories or labels to a whole document, or parts of a document. 
+ +cell #[+procon("pro")] + + +row + +cell #[strong Rule-based Matching] + +cell + | Finding sequences of tokens based on their texts and linguistic + | annotations, similar to regular expressions. + +cell #[+procon("con")] + +row +cell #[strong Training] +cell Updating and improving a statistical model's predictions. From 0fb89dd204a4caec3d48bff3b8d0ec5868415759 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:56:07 +0200 Subject: [PATCH 5/8] Add text classification usage guide template --- website/docs/usage/_data.json | 6 ++++++ website/docs/usage/text-classification.jade | 5 +++++ 2 files changed, 11 insertions(+) create mode 100644 website/docs/usage/text-classification.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index 81deeb402..c8373a095 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -16,6 +16,7 @@ "Rule-based matching": "rule-based-matching", "Adding languages": "adding-languages", "Processing pipelines": "language-processing-pipeline", + "Text classification": "text-classification", "Deep learning": "deep-learning", "Production use": "production-use", "Training": "training", @@ -106,6 +107,11 @@ "next": "production use" }, + "text-classification": { + "title": "Text classification", + "next": "training" + }, + "production-use": { "title": "Production use", "next": "training" diff --git a/website/docs/usage/text-classification.jade b/website/docs/usage/text-classification.jade new file mode 100644 index 000000000..33e384dbd --- /dev/null +++ b/website/docs/usage/text-classification.jade @@ -0,0 +1,5 @@ +//- 💫 DOCS > USAGE > TEXT CLASSIFICATION + +include ../../_includes/_mixins + ++under-construction From ab1a4e8b3c2426289ce0d0cb549e95494a53dfca Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:56:25 +0200 Subject: [PATCH 6/8] Add Tensorizer API docs stub --- website/docs/api/_data.json | 7 +++++++ website/docs/api/tensorizer.jade | 7 +++++++ 2 files changed, 14 
insertions(+) create mode 100644 website/docs/api/tensorizer.jade diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index a2e447dc8..1102c679a 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -17,6 +17,7 @@ "Span": "span", "Language": "language", "Tokenizer": "tokenizer", + "Tensorizer": "tensorizer", "Tagger": "tagger", "DependencyParser": "dependencyparser", "EntityRecognizer": "entityrecognizer", @@ -147,6 +148,12 @@ "source": "spacy/pipeline.pyx" }, + "tensorizer": { + "title": "Tensorizer", + "tag": "class", + "source": "spacy/pipeline.pyx" + }, + "goldparse": { "title": "GoldParse", "tag": "class", diff --git a/website/docs/api/tensorizer.jade b/website/docs/api/tensorizer.jade new file mode 100644 index 000000000..9abd6793b --- /dev/null +++ b/website/docs/api/tensorizer.jade @@ -0,0 +1,7 @@ +//- 💫 DOCS > API > TENSORIZER + +include ../../_includes/_mixins + +p Add a tensor with position-sensitive meaning representations to a #[code Doc]. 
+ ++under-construction From f085b88f9d61f701f4b7d937b584dfe6ef4fc35d Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:56:33 +0200 Subject: [PATCH 7/8] Add TextCategorizer API docs stub --- website/docs/api/_data.json | 7 +++++++ website/docs/api/textcategorizer.jade | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 website/docs/api/textcategorizer.jade diff --git a/website/docs/api/_data.json b/website/docs/api/_data.json index 1102c679a..e413f200c 100644 --- a/website/docs/api/_data.json +++ b/website/docs/api/_data.json @@ -21,6 +21,7 @@ "Tagger": "tagger", "DependencyParser": "dependencyparser", "EntityRecognizer": "entityrecognizer", + "TextCategorizer": "textcategorizer", "Matcher": "matcher", "Lexeme": "lexeme", "Vocab": "vocab", @@ -130,6 +131,12 @@ "source": "spacy/pipeline.pyx" }, + "textcategorizer": { + "title": "TextCategorizer", + "tag": "class", + "source": "spacy/pipeline.pyx" + }, + "dependencyparser": { "title": "DependencyParser", "tag": "class", diff --git a/website/docs/api/textcategorizer.jade b/website/docs/api/textcategorizer.jade new file mode 100644 index 000000000..926d957f7 --- /dev/null +++ b/website/docs/api/textcategorizer.jade @@ -0,0 +1,21 @@ +//- 💫 DOCS > API > TEXTCATEGORIZER + +include ../../_includes/_mixins + +p + | Add text categorization models to spaCy pipelines. The model supports + | classification with multiple, non-mutually exclusive labels. + +p + | You can change the model architecture rather easily, but by default, the + | #[code TextCategorizer] class uses a convolutional neural network to + | assign position-sensitive vectors to each word in the document. This step + | is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the + | #[code TextCategorizer] uses its own CNN model, to avoid sharing weights + | with the other pipeline components. 
The document tensor is then + | summarized by concatenating max and mean pooling, and a multilayer + | perceptron is used to predict an output vector of length #[code nr_class], + | before a logistic activation is applied elementwise. The value of each + | output neuron is the probability that some class is present. + ++under-construction From ab8ffbaab7f01524c6633e94d3c485704dbd4b2d Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 22 Jul 2017 17:56:51 +0200 Subject: [PATCH 8/8] Add text classification to v2 overview --- website/docs/usage/v2.jade | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/website/docs/usage/v2.jade b/website/docs/usage/v2.jade index bbcfe865f..d9727c62b 100644 --- a/website/docs/usage/v2.jade +++ b/website/docs/usage/v2.jade @@ -38,6 +38,7 @@ p +item #[+a("#summary") Summary] +item #[+a("#features") New features] +item #[+a("#features-pipelines") Improved processing pipelines] + +item #[+a("#features-text-classification") Text classification] +item #[+a("#features-hash-ids") Hash values instead of integer IDs] +item #[+a("#features-serializer") Saving, loading and serialization] +item #[+a("#features-displacy") displaCy visualizer] @@ -102,6 +103,26 @@ p | #[strong API:] #[+api("language") #[code Language]] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] ++h(3, "features-text-classification") Text classification + ++aside-code("Example"). + from spacy.lang.en import English + nlp = English(pipeline=['tensorizer', 'tagger', 'textcat']) + +p + | spaCy v2.0 lets you add text categorization models to spaCy pipelines. + | The model supports classification with multiple, non-mutually exclusive + | labels – so multiple labels can apply at once. You can change the model + | architecture rather easily, but by default, the #[code TextCategorizer] + | class uses a convolutional neural network to assign position-sensitive + | vectors to each word in the document. 
+ ++infobox + | #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]], + | #[+api("doc#attributes") #[code Doc.cats]], + | #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br] + | #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification] + +h(3, "features-hash-ids") Hash values instead of integer IDs +aside-code("Example").