Document current JSON format for training

2025-08-04 04:10:20 +03:00 · 2017-10-24 15:50:41 +02:00 · 2017-10-24 15:50:41 +02:00 · c9dc88ddfc
commit c9dc88ddfc
parent 2b8e7c45e0
3 changed files with 51 additions and 28 deletions
--- a/website/api/_annotation/_training.jade
+++ b/website/api/_annotation/_training.jade
@@ -0,0 +1,46 @@
+//- 💫 DOCS > API > ANNOTATION > TRAINING
+
+p
+    |  spaCy takes training data in JSON format. The built-in
+    |  #[+api("cli#convert") #[code convert]] command helps you convert the
+    |  #[code .conllu] format used by the
+    |  #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora]
+    |  to spaCy's training format.
+
++aside("Annotating entities")
+    |  Named entities are provided in the #[+a("/api/annotation#biluo") BILUO]
+    |  notation. Tokens outside an entity are set to #[code "O"] and tokens
+    |  that are part of an entity are set to the entity label, prefixed by the
+    |  BILUO marker. For example #[code "B-ORG"] describes the first token of
+    |  a multi-token #[code ORG] entity and #[code "U-PERSON"] a single
+    |  token representing a #[code PERSON] entity.
+
++code("Example structure").
+    [{
+        "id": int,                      # ID of the document within the corpus
+        "paragraphs": [{                # list of paragraphs in the document
+            "raw": string,              # raw text of the paragraph
+            "sentences": [{             # list of sentences in the paragraph
+                "tokens": [{            # list of tokens in the sentence
+                    "id": int,          # index of the token in the document
+                    "dep": string,      # dependency label
+                    "head": int,        # offset of token head relative to token index
+                    "tag": string,      # part-of-speech tag
+                    "orth": string,     # verbatim text of the token
+                    "ner": string       # BILUO label, e.g. "O" or "B-ORG"
+                }],
+                "brackets": [{          # phrase structure (NOT USED by current models)
+                    "first": int,       # index of first token
+                    "last": int,        # index of last token
+                    "label": string     # phrase label
+                }]
+            }]
+        }]
+    }]
+
+p
+    |  Here's an example of dependencies, part-of-speech tags and named
+    |  entities, taken from the English Wall Street Journal portion of the Penn
+    |  Treebank:
+
++github("spacy", "examples/training/training-data.json", false, false, "json")
--- a/website/api/annotation.jade
+++ b/website/api/annotation.jade
@@ -101,31 +101,4 @@ p This document describes the target annotations spaCy is trained to predict.
 +section("training")
    +h(2, "json-input") JSON input format for training

-    +under-construction
-
-    p spaCy takes training data in the following format:
-
-    +code("Example structure").
-        doc: {
-            id: string,
-            paragraphs: [{
-                raw: string,
-                sents: [int],
-                tokens: [{
-                    start: int,
-                    tag: string,
-                    head: int,
-                    dep: string
-                }],
-                ner: [{
-                    start: int,
-                    end: int,
-                    label: string
-                }],
-                brackets: [{
-                    start: int,
-                    end: int,
-                    label: string
-                }]
-            }]
-        }
+    include _annotation/_training
--- a/website/usage/_training/_tagger-parser.jade
+++ b/website/usage/_training/_tagger-parser.jade
@@ -1,3 +1,7 @@
 //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER

 +under-construction
+
++h(3, "training-json") JSON format for training
+
+include ../../api/_annotation/_training