* Work on documentation. Have overall structure now

2025-10-26 05:31:15 +03:00 · 2015-08-12 20:21:26 +02:00 · 2015-08-12 20:21:26 +02:00 · c767ab9fdf
commit c767ab9fdf
parent ab39f358c1
1 changed files with 559 additions and 417 deletions
--- a/docs/redesign/spacy_docs.jade
+++ b/docs/redesign/spacy_docs.jade
@ -1,17 +1,19 @@
- var unicode_type = '<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>'
+- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
 - var bool_type = '<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>'
 - var int_type = ""
 - var Token_type = ""
 - var Span_type = ""
 - var Vocab_type = ""
 - var generator_type = ""
 -
  var types = {
   'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
   'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
   'int': py_docs + 'functions.html#int"><em>int</em></a>',
   'generator': "",
   'Vocab': "",
   'Span': "",
   'Doc': ""
  }
 mixin declare_class(name)
-  details(open="true")
+  details
    summary
      span.declaration
        span.label class
@ -62,14 +64,54 @@ mixin returns(name, type, value)
 mixin returns(type)
  | tmp
 mixin init
  details
    summary: h4 Init
    block
 mixin callable
  details
    summary: h4 Callable
    block
 mixin sequence
  details
    summary: h4 Sequence
    block
 mixin maptype
  details
    summary: h4 Map
    block
 mixin summary
  block
 mixin en_example
  pre.language-python
    code
      | from spacy.en import English
      | from spacy._doc_examples import download_war_and_peace
      | 
      | unprocessed_unicode = download_war_and_peace()
      | 
      | nlp = English()
      | doc = nlp(unprocessed_unicode)
 doctype html
 html(lang="en")
  head
    meta(charset="utf-8")
-    title!= tag_line
+    title spaCy &ndash; Industrial-strength NLP
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
@ -78,9 +120,9 @@ html(lang="en")
    <![endif]-->
  body(id="docs")
-    header
+    header(role="banner")
-      h1.logo!= tag_line
+      h1.logo spaCy &ndash; Industrial-strength NLP
-      div.slogan!= slogan
+      div.slogan API
    nav(role="navigation")
@ -91,14 +133,27 @@ html(lang="en")
        li: a(href="#") Blog
    main.docs#content
      section.intro
          | Tmp
      article
-        h3: a(href="#") Header
+        +declare_class("English")
          p Load models into a callable object to process English text.
-        +declare_class("spacy.en.English")
+          +summary
-          +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
+            +en_example
          +init
            p
              | Load the resources.  Loading takes 20 seconds, and the instance
              | consumes 2 to 3 gigabytes of memory.
            p 
              | Intended use is for one instance to be created per process.
              | You can create more if you're doing something unusual.
            p
              | You may wish to make the instance a global variable or "singleton".
              | We usually instantiate the object in the <code>main()</code>
              | function and pass it around as an explicit argument. 
            +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
              +params
                +param("data_dir")
@ -120,11 +175,11 @@ html(lang="en")
                +param("load_vectors")
                  | A boolean value to control whether the word vectors are loaded.
-
+          +callable
-          +method("__call__", "text, tag=True, parse=True, entity=True")(open)
+            +method("__call__", "text, tag=True, parse=True, entity=True")
              +params
-              +param("text", unicode_type)
+                +param("text", types.unicode)
                  | The text to be processed.  No pre-processing needs to be applied,
                  | and any length of text can be submitted.  Usually you will submit
                  | a whole document. Text may be zero-length. An exception is raised
@ -152,17 +207,22 @@ html(lang="en")
                    | # doc = nlp(b'Some text') <-- Error: need unicode
                    | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
-      +declare_class("spacy.tokens.doc.Doc")
+
        +declare_class("Doc")
          p I'm a doc
          +init
            +method("__init__", "vocab")
              +params
                +param("vocab", vocab_type)
                  | A vocabulary object
-        +method("__getitem__", "i", int_type)
+          +sequence
-          +returns(Token_type)
+            +method("__getitem__", "i", types.int)
              +returns(types.Token)
-        +method("__getitem__", "start_end", slice_type)
+            +method("__getitem__", "start_end", types.slice)
-          +returns(Span_type)
+              +returns(types.Span)
            +method("__iter__")
              | Iterate over tokens
@ -170,13 +230,19 @@ html(lang="en")
            +method("__len__")
              | Number of tokens in the document.
-        +attribute("sents", generator_type)
+          details
            summary: h4 Spans
            +attribute("sents", types.generator)
              | Iterate over sentences in the document.
-        +attribute("ents", generator_type)
+            +attribute("ents", types.generator)
              | Iterate over named entities in the document.
-        +attribute("noun_chunks", generator_type)
+            +attribute("noun_chunks", types.generator)
          details
            summary: h4 Export/Import
            +method("to_array", "attr_ids")
@ -184,7 +250,6 @@ html(lang="en")
              | of shape N*M, where N is the length of the sentence.
              +params
                +param("attr_ids", "list[int]")
                  | A list of attribute ID ints.
@ -193,7 +258,6 @@ html(lang="en")
                | indicated in the input attr_ids.
            +method("count_by", "attr_id")
              | Produce a dict of {attribute (int): count (int)} frequencies, keyed
              | by the values of the given attribute ID.
@ -213,31 +277,29 @@ html(lang="en")
            +method("from_array", "attrs, array")
              | Load from array
        +method("to_bytes")
          | Serialize
            +method("from_bytes")
              | Deserialize, loading from bytes
            +method("read_bytes")
              | classmethod
-        +method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
+            //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
-          | Merge a multi-word expression into a single token.  Currently
+            //  | Merge a multi-word expression into a single token.  Currently
-          | experimental; API is likely to change.
+            //  | experimental; API is likely to change.
-      +declare_class("spacy.tokens.Token")
+        +declare_class("Token")
          +init
            +method("__init__", "vocab, doc, offset")
              +params
-            +param("vocab", Vocab_type)
+                +param("vocab", types.Vocab)
                  p A Vocab object
-            +param("doc", Doc_type)
+                +param("doc", types.Doc)
                  p The parent sequence
-          +param("offset", Int_type)
+              +param("offset", types.int)
                p The index of the token within the document
          details
@ -336,11 +398,13 @@ html(lang="en")
            summary: h4 Syntactic Tags
            +attribute("pos / pos_")
              p
                | A part-of-speech tag, from the Google Universal Tag Set, e.g. 
                | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for
                | the 17 tag values are provided in <code>spacy.parts_of_speech</code>.
            +attribute("tag / tag_")
              p
                | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
                | <code>DT</code>, etc.  These tags are language/corpus specific, and
                | typically describe part-of-speech and some amount of morphological
@ -348,6 +412,7 @@ html(lang="en")
                | is assigned to a present-tense singular verb.
            +attribute("dep / dep_")
              p
                | The type of syntactic dependency relation between the word and its
                | syntactic head.
@ -426,8 +491,14 @@ html(lang="en")
            //+attribute("conjuncts")
            //  | Conjuncts
-      +declare_class("spacy.tokens.span.Span")
+        +declare_class("Span")
-        +params
+          +init
            +method("__init__")
              | Temp
            <code>span = doc[0:4]</code>
          +sequence
            +method("__getitem__")
              p Get item
@ -437,6 +508,9 @@ html(lang="en")
            +method("__len__")
              p Len
          details
            summary: h4 Parse
            +attribute("root")
              p Syntactic head
@ -464,6 +538,13 @@ html(lang="en")
                  | rights = [span.doc[i] for i in range(span.end, len(span.doc))
                  |           if span.doc[i].head in span]
            +attribute("subtree")
              p String
          details
            summary: h4 String Views
            +attribute("string")
              p String
@ -473,14 +554,61 @@ html(lang="en")
            +attribute("label / label_")
              p String
-          +attribute("subtree")
+        +declare_class("Lexeme")
-            p String
+          p
            | The Lexeme object represents a lexical type, stored in the vocabulary
            | &ndash; as opposed to a token, occurring in a document.
          p
            | Lexemes store various features, so that these features can be computed
            | once per type, rather than once per token. As job sizes grow, this
            | can amount to a substantial efficiency improvement.
-      +declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
+          p
            | All Lexeme attributes are therefore context independent, as a single
            | lexeme is reused for all usages of that word. Lexemes are keyed by
            | the “orth” attribute.
          p
            | All Lexeme attributes are accessible directly on the Token object.
          +init
            +method("__init__")
              p Init
            details
              summary: h4 String Features
                +attribute("orth / orth_")
                  p
                    | The form of the word with no string normalization or processing,
                    | as it appears in the string, without trailing whitespace.
                +attribute("lower / lower_")
                  p Tmp
                +attribute("norm / norm_")
                  p Tmp
                +attribute("shape / shape_")
                  p Tmp
                +attribute("prefix / prefix_")
                  p Tmp
                +attribute("suffix / suffix_")
                  p TMP
        +declare_class("Vocab", "data_dir=None, lex_props_getter=None")
          +sequence
            +method("__len__")
              +returns
                p Number of words in the vocabulary.
            +method("__iter__")
              +returns
                p Lexeme
          +maptype
            +method("__getitem__", "key_int")
              +params
                +param("key")
@ -490,48 +618,59 @@ html(lang="en")
            +method("__getitem__", "key_str")
              +params
-            +param("key_str", unicode_type)
+                +param("key_str", types.unicode)
                  p A string in the vocabulary
              +returns("Lexeme")
            +method("__setitem__", "orth_str", "props")
              +params
-            +param("orth_str", unicode_type)
+                +param("orth_str", types.unicode)
                  p The orth key
-            +param("props", dict_type)
+                +param("props", types.dict)
                  p A props dictionary
              +returns("None")
          details
            summary: h4 Import/Export
            +method("dump", "loc")
              +params
-            +param("loc", unicode_type)
+                +param("loc", types.unicode)
                  p Path where the vocabulary should be saved
            +method("load_lexemes", "loc")
            +params
-            +param("loc", unicode_type)
+              +param("loc", types.unicode)
                p Path to load the lexemes.bin file from
            +method("load_vectors", "loc")
              +params
-            +param("loc", unicode_type)
+                +param("loc", types.unicode)
                  p Path to load the vectors.bin from
        +declare_class("StringStore")
          +init
            | Tmp
-      +declare_class("spacy.strings.StringStore")
+          +sequence
            +method("__len__")
              +returns("int")
                p Number of strings in the string-store
            +method("__iter__")
              +returns
                p Lexeme
          +maptype
            +method("__getitem__", "key_int")
              +params
                +param("key_int")
                  p An integer key
-          +returns(unicode_type)
+              +returns(types.unicode)
                p The string that the integer key maps to
            +method("__getitem__", "key_unicode")
@ -539,17 +678,20 @@ html(lang="en")
                +param("key_unicode")
                  p A key, as a unicode string
-          +returns(int_type)
+              +returns(types.int)
                p The integer ID of the string.
            +method("__getitem__", "key_utf8_bytes")
              +params
-            +param("key_utf8_bytes", bytes_type)
+                +param("key_utf8_bytes", types.bytes)
                  p A key, as a UTF-8 encoded byte-string
-          +returns(int_type)
+              +returns(types.int)
                p The integer ID of the string.
          details
            summary: h4 Import/Export
            +method("dump", "loc")
              +params
                +param("loc")