mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						c27fdaef6f
					
				|  | @ -17,9 +17,11 @@ | |||
|             "Span": "span", | ||||
|             "Language": "language", | ||||
|             "Tokenizer": "tokenizer", | ||||
|             "Tensorizer": "tensorizer", | ||||
|             "Tagger": "tagger", | ||||
|             "DependencyParser": "dependencyparser", | ||||
|             "EntityRecognizer": "entityrecognizer", | ||||
|             "TextCategorizer": "textcategorizer", | ||||
|             "Matcher": "matcher", | ||||
|             "Lexeme": "lexeme", | ||||
|             "Vocab": "vocab", | ||||
|  | @ -129,6 +131,12 @@ | |||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "textcategorizer": { | ||||
|         "title": "TextCategorizer", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "dependencyparser": { | ||||
|         "title": "DependencyParser", | ||||
|         "tag": "class", | ||||
|  | @ -147,6 +155,12 @@ | |||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "tensorizer": { | ||||
|         "title": "Tensorizer", | ||||
|         "tag": "class", | ||||
|         "source": "spacy/pipeline.pyx" | ||||
|     }, | ||||
| 
 | ||||
|     "goldparse": { | ||||
|         "title": "GoldParse", | ||||
|         "tag": "class", | ||||
|  |  | |||
|  | @ -40,7 +40,7 @@ p | |||
| +h(2, "pos-tagging") Part-of-speech Tagging | ||||
| 
 | ||||
| +aside("Tip: Understanding tags") | ||||
|     |  You can also use #[code spacy.explain()] to get the description for the | ||||
|     |  You can also use #[code spacy.explain()] to get the description for the | ||||
|     |  string representation of a tag. For example, | ||||
|     |  #[code spacy.explain("RB")] will return "adverb". | ||||
| 
 | ||||
|  |  | |||
|  | @ -558,10 +558,20 @@ p | |||
|         +cell The store of lexical types. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code tensor] | ||||
|         +cell #[code tensor] #[+tag-new(2)] | ||||
|         +cell object | ||||
|         +cell Container for dense vector representations. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code cats] #[+tag-new(2)] | ||||
|         +cell dictionary | ||||
|         +cell | ||||
|             |  Maps either a label to a score for categories applied to the whole | ||||
|             |  document, or #[code (start_char, end_char, label)] to a score for | ||||
|             |  categories applied to spans. #[code start_char] and #[code end_char] | ||||
|             |  should be character offsets, label can be either a string or an | ||||
|             |  integer ID, and score should be a float. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code user_data] | ||||
|         +cell - | ||||
|  |  | |||
|  | @ -103,6 +103,14 @@ p | |||
|         +cell list | ||||
|         +cell The alignment from gold tokenization to candidate tokenization. | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[code cats] #[+tag-new(2)] | ||||
|         +cell list | ||||
|         +cell | ||||
|             |  Entries in the list should be either a label, or a | ||||
|             |  #[code (start, end, label)] triple. The tuple form is used for | ||||
|             |  categories applied to spans of the document. | ||||
| 
 | ||||
| 
 | ||||
| +h(2, "util") Utilities | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										7
									
								
								website/docs/api/tensorizer.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								website/docs/api/tensorizer.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,7 @@ | |||
| //- 💫 DOCS > API > TENSORIZER | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| p Add a tensor with position-sensitive meaning representations to a #[code Doc]. | ||||
| 
 | ||||
| +under-construction | ||||
							
								
								
									
										21
									
								
								website/docs/api/textcategorizer.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								website/docs/api/textcategorizer.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,21 @@ | |||
| //- 💫 DOCS > API > TEXTCATEGORIZER | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| p | ||||
|     |  Add text categorization models to spaCy pipelines. The model supports | ||||
|     |  classification with multiple, non-mutually exclusive labels. | ||||
| 
 | ||||
| p | ||||
|     |  You can change the model architecture rather easily, but by default, the | ||||
|     |  #[code TextCategorizer] class uses a convolutional neural network to | ||||
|     |  assign position-sensitive vectors to each word in the document. This step | ||||
|     |  is similar to the #[+api("tensorizer") #[code Tensorizer]] component, but the | ||||
|     |  #[code TextCategorizer] uses its own CNN model, to avoid sharing weights | ||||
|     |  with the other pipeline components. The document tensor is then | ||||
|     |  summarized by concatenating max and mean pooling, and a multilayer | ||||
|     |  perceptron is used to predict an output vector of length #[code nr_class], | ||||
|     |  before a logistic activation is applied elementwise. The value of each | ||||
|     |  output neuron is the probability that some class is present. | ||||
| 
 | ||||
| +under-construction | ||||
|  | @ -16,6 +16,7 @@ | |||
|             "Rule-based matching": "rule-based-matching", | ||||
|             "Adding languages": "adding-languages", | ||||
|             "Processing pipelines": "language-processing-pipeline", | ||||
|             "Text classification": "text-classification", | ||||
|             "Deep learning": "deep-learning", | ||||
|             "Production use": "production-use", | ||||
|             "Training": "training", | ||||
|  | @ -106,6 +107,11 @@ | |||
|         "next": "production use" | ||||
|     }, | ||||
| 
 | ||||
|     "text-classification": { | ||||
|         "title": "Text classification", | ||||
|         "next": "training" | ||||
|     }, | ||||
| 
 | ||||
|     "production-use": { | ||||
|         "title": "Production use", | ||||
|         "next": "training" | ||||
|  |  | |||
|  | @ -129,13 +129,6 @@ p | |||
|             |  locations. | ||||
|         +cell #[+procon("pro")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Rule-based Matching] | ||||
|         +cell | ||||
|             |  Finding sequences of tokens based on their texts and linguistic | ||||
|             |  annotations, similar to regular expressions. | ||||
|         +cell #[+procon("con")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Similarity] | ||||
|         +cell | ||||
|  | @ -143,6 +136,18 @@ p | |||
|             |  are to each other. | ||||
|         +cell #[+procon("pro")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Text classification] | ||||
|         +cell Assigning categories or labels to a whole document, or parts of a document. | ||||
|         +cell #[+procon("pro")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Rule-based Matching] | ||||
|         +cell | ||||
|             |  Finding sequences of tokens based on their texts and linguistic | ||||
|             |  annotations, similar to regular expressions. | ||||
|         +cell #[+procon("con")] | ||||
| 
 | ||||
|     +row | ||||
|         +cell #[strong Training] | ||||
|         +cell Updating and improving a statistical model's predictions. | ||||
|  |  | |||
							
								
								
									
										5
									
								
								website/docs/usage/text-classification.jade
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5
									
								
								website/docs/usage/text-classification.jade
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,5 @@ | |||
| //- 💫 DOCS > USAGE > TEXT CLASSIFICATION | ||||
| 
 | ||||
| include ../../_includes/_mixins | ||||
| 
 | ||||
| +under-construction | ||||
|  | @ -38,6 +38,7 @@ p | |||
|         +item #[+a("#summary") Summary] | ||||
|         +item #[+a("#features") New features] | ||||
|         +item #[+a("#features-pipelines") Improved processing pipelines] | ||||
|         +item #[+a("#features-text-classification") Text classification] | ||||
|         +item #[+a("#features-hash-ids") Hash values instead of integer IDs] | ||||
|         +item #[+a("#features-serializer") Saving, loading and serialization] | ||||
|         +item #[+a("#features-displacy") displaCy visualizer] | ||||
|  | @ -102,6 +103,26 @@ p | |||
|     |  #[strong API:] #[+api("language") #[code Language]] | ||||
|     |  #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] | ||||
| 
 | ||||
| +h(3, "features-text-classification") Text classification | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|     from spacy.lang.en import English | ||||
|     nlp = English(pipeline=['tensorizer', 'tagger', 'textcat']) | ||||
| 
 | ||||
| p | ||||
|     |  spaCy v2.0 lets you add text categorization models to spaCy pipelines. | ||||
|     |  The model supports classification with multiple, non-mutually exclusive | ||||
|     |  labels – so multiple labels can apply at once. You can change the model | ||||
|     |  architecture rather easily, but by default, the #[code TextCategorizer] | ||||
|     |  class uses a convolutional neural network to assign position-sensitive | ||||
|     |  vectors to each word in the document. | ||||
| 
 | ||||
| +infobox | ||||
|     |  #[strong API:] #[+api("textcategorizer") #[code TextCategorizer]], | ||||
|     |  #[+api("doc#attributes") #[code Doc.cats]], | ||||
|     |  #[+api("goldparse#attributes") #[code GoldParse.cats]]#[br] | ||||
|     |  #[strong Usage:] #[+a("/docs/usage/text-classification") Text classification] | ||||
| 
 | ||||
| +h(3, "features-hash-ids") Hash values instead of integer IDs | ||||
| 
 | ||||
| +aside-code("Example"). | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user