* Work on documentation. Have overall structure now

Matthew Honnibal 2015-08-12 20:21:26 +02:00
parent ab39f358c1
commit c767ab9fdf


@@ -1,17 +1,19 @@
- var unicode_type = '<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>'
- var bool_type = '<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>'
- var int_type = ""
- var Token_type = ""
- var Span_type = ""
- var Vocab_type = ""
- var generator_type = ""
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
-
var types = {
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
'int': py_docs + 'functions.html#int"><em>int</em></a>',
'generator': "",
'Vocab': "",
'Span': "",
'Doc': ""
}
mixin declare_class(name)
details(open="true")
details
summary
span.declaration
span.label class
@@ -62,14 +64,54 @@ mixin returns(name, type, value)
mixin returns(type)
| tmp
mixin init
details
summary: h4 Init
block
mixin callable
details
summary: h4 Callable
block
mixin sequence
details
summary: h4 Sequence
block
mixin maptype
details
summary: h4 Map
block
mixin summary
block
mixin en_example
pre.language-python
code
| from spacy.en import English
| from spacy._doc_examples import download_war_and_peace
|
| unprocessed_unicode = download_war_and_peace()
|
| nlp = English()
| doc = nlp(unprocessed_unicode)
doctype html
html(lang="en")
head
meta(charset="utf-8")
title!= tag_line
title spaCy &ndash; Industrial-strength NLP
meta(name="description" content="")
meta(name="author" content="Matthew Honnibal")
link(rel="stylesheet" href="css/style.css")
@@ -78,9 +120,9 @@ html(lang="en")
<![endif]-->
body(id="docs")
header
h1.logo!= tag_line
div.slogan!= slogan
header(role="banner")
h1.logo spaCy &ndash; Industrial-strength NLP
div.slogan API
nav(role="navigation")
@@ -91,14 +133,27 @@ html(lang="en")
li: a(href="#") Blog
main.docs#content
section.intro
| Tmp
article
h3: a(href="#") Header
+declare_class("English")
p Load models into a callable object to process English text.
+declare_class("spacy.en.English")
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
+summary
+en_example
+init
p
| Load the resources. Loading takes 20 seconds, and the instance
| consumes 2 to 3 gigabytes of memory.
p
| Intended use is for one instance to be created per process.
| You can create more if you're doing something unusual.
p
| You may wish to make the instance a global variable or "singleton".
| We usually instantiate the object in the <code>main()</code>
| function and pass it around as an explicit argument.
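p
| A minimal sketch of that pattern (the <code>count_tokens</code>
| helper is hypothetical):
pre.language-python
code
| from spacy.en import English
|
| def count_tokens(nlp, text):
|     # Hypothetical helper: receives the shared instance explicitly.
|     return len(nlp(text))
|
| def main():
|     nlp = English()  # Load once per process
|     print(count_tokens(nlp, u'A sentence to process.'))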
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
+params
+param("data_dir")
@@ -120,11 +175,11 @@ html(lang="en")
+param("load_vectors")
| A boolean value to control whether the word vectors are loaded.
+method("__call__", "text, tag=True, parse=True, entity=True")(open)
+callable
+method("__call__", "text, tag=True, parse=True, entity=True")
+params
+param("text", unicode_type)
+param("text", types.unicode)
| The text to be processed. No pre-processing needs to be applied,
| and any length of text can be submitted. Usually you will submit
| a whole document. Text may be zero-length. An exception is raised
@@ -152,17 +207,22 @@ html(lang="en")
| # doc = nlp(b'Some text') <-- Error: need unicode
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
+declare_class("spacy.tokens.doc.Doc")
+declare_class("Doc")
p I'm a doc
+init
+method("__init__", "vocab")
+params
+param("vocab", vocab_type)
| A vocabulary object
+method("__getitem__", "i", int_type)
+returns(Token_type)
+sequence
+method("__getitem__", "i", types.int)
+returns(types.Token)
+method("__getitem__", "start_end", slice_type)
+returns(Span_type)
+method("__getitem__", "start_end", types.slice)
+returns(types.Span)
+method("__iter__")
| Iterate over tokens
@@ -170,13 +230,19 @@ html(lang="en")
+method("__len__")
| Number of tokens in the document.
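p
| For example, a small sketch (assuming <code>nlp</code> is a loaded
| <code>English</code> instance):
pre.language-python
code
| doc = nlp(u'Hello, world. Here are two sentences.')
| words = [token.orth_ for token in doc]  # __iter__ yields Token objects
| n_tokens = len(doc)                     # __len__ counts tokens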
+attribute("sents", generator_type)
details
summary: h4 Spans
+attribute("sents", types.generator)
| Iterate over sentences in the document.
+attribute("ents", generator_type)
+attribute("ents", types.generator)
| Iterate over named entities in the document.
+attribute("noun_chunks", generator_type)
+attribute("noun_chunks", types.generator)
details
summary: h4 Export/Import
+method("to_array", "attr_ids")
@@ -184,7 +250,6 @@ html(lang="en")
| of shape N*M, where N is the length of the sentence.
+params
+param("attr_ids", "list[int]")
| A list of attribute ID ints.
@@ -193,7 +258,6 @@ html(lang="en")
| indicated in the input attr_ids.
+method("count_by", "attr_id")
| Produce a dict of {attribute (int): count (int)} frequencies, keyed
| by the values of the given attribute ID.
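p
| For instance, a sketch using the attribute-ID constants (assuming
| they can be imported from <code>spacy.attrs</code>):
pre.language-python
code
| from spacy.attrs import ORTH, POS
|
| feature_matrix = doc.to_array([ORTH, POS])  # one row per token
| orth_counts = doc.count_by(ORTH)            # {orth ID: frequency}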
@@ -213,31 +277,29 @@ html(lang="en")
+method("from_array", "attrs, array")
| Load from array
+method("to_bytes")
| Serialize
+method("from_bytes")
| Deserialize, loading from bytes
+method("read_bytes")
| classmethod
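p
| A sketch of a round-trip through the serialization methods (the
| file name is illustrative; assumes <code>nlp</code> and
| <code>doc</code> from the examples above):
pre.language-python
code
| from spacy.tokens.doc import Doc
|
| with open('docs.bin', 'wb') as file_:
|     file_.write(doc.to_bytes())
| with open('docs.bin', 'rb') as file_:
|     # read_bytes yields one byte-string per serialized Doc
|     for byte_string in Doc.read_bytes(file_):
|         doc = Doc(nlp.vocab).from_bytes(byte_string)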
+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
| Merge a multi-word expression into a single token. Currently
| experimental; API is likely to change.
// | Merge a multi-word expression into a single token. Currently
// | experimental; API is likely to change.
+declare_class("spacy.tokens.Token")
+declare_class("Token")
+init
+method("__init__", "vocab, doc, offset")
+params
+param("vocab", Vocab_type)
+param("vocab", types.Vocab)
p A Vocab object
+param("doc", Doc_type)
+param("doc", types.Doc)
p The parent sequence
+param("offset", Int_type)
+param("offset", types.int)
p The index of the token within the document
details
@@ -336,11 +398,13 @@ html(lang="en")
summary: h4 Syntactic Tags
+attribute("pos / pos_")
p
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
| the 17 tag values are provided in <code>spacy.parts_of_speech</code>.
+attribute("tag / tag_")
p
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
| <code>DT</code>, etc. These tags are language/corpus specific, and
| typically describe part-of-speech and some amount of morphological
@@ -348,6 +412,7 @@ html(lang="en")
| is assigned to a present-tense singular verb.
+attribute("dep / dep_")
p
| The type of syntactic dependency relation between the word and its
| syntactic head.
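p
| For example, on a parsed document:
pre.language-python
code
| token = doc[0]
| token.pos_  # coarse tag from the Universal Tag Set, e.g. u'NOUN'
| token.tag_  # fine-grained, corpus-specific tag, e.g. u'NN'
| token.dep_  # the label of the relation to the token's syntactic head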
@@ -426,8 +491,14 @@ html(lang="en")
//+attribute("conjuncts")
// | Conjuncts
+declare_class("spacy.tokens.span.Span")
+params
+declare_class("Span")
+init
+method("__init__")
Temp
<code>span = doc[0:4]</code>
+sequence
+method("__getitem__")
p Get item
@@ -437,6 +508,9 @@ html(lang="en")
+method("__len__")
p Len
details
summary: h4 Parse
+attribute("root")
p Syntactic head
@@ -464,6 +538,13 @@ html(lang="en")
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
| if span.doc[i].head in span]
+attribute("subtree")
p String
details
summary: h4 String Views
+attribute("string")
p String
@@ -473,14 +554,61 @@ html(lang="en")
+attribute("label / label_")
p String
+attribute("subtree")
p String
+declare_class("Lexeme")
p
| The Lexeme object represents a lexical type, stored in the vocabulary
| &ndash; as opposed to a token, occurring in a document.
p
| Lexemes store various features, so that these features can be computed
| once per type, rather than once per token. As job sizes grow, this
| can amount to a substantial efficiency improvement.
+declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
p
| All Lexeme attributes are therefore context independent, as a single
| lexeme is reused for all usages of that word. Lexemes are keyed by
| the “orth” attribute.
p
| All Lexeme attributes are accessible directly on the Token object.
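p
| For instance, a sketch of the type/token distinction (assuming
| <code>nlp</code> is a loaded <code>English</code> instance):
pre.language-python
code
| # One Lexeme per type, looked up through the vocabulary:
| lexeme = nlp.vocab[u'hello']
| # The same context-independent attributes, read through tokens:
| doc = nlp(u'hello world hello')
| assert doc[0].orth == doc[2].orth == lexeme.orth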
+init
+method("__init__")
p Init
details
summary: h4 String Features
+attribute("orth / orth_")
p
| The form of the word with no string normalization or processing,
| as it appears in the string, without trailing whitespace.
+attribute("lower / lower_")
p Tmp
+attribute("norm / norm_")
p Tmp
+attribute("shape / shape_")
p Tmp
+attribute("prefix / prefix_")
p Tmp
+attribute("suffix / suffix_")
p Tmp
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
+sequence
+method("__len__")
+returns
p Number of words in the vocabulary.
+method("__iter__")
+returns
p Lexeme
+maptype
+method("__getitem__", "key_int")
+params
+param("key")
@@ -490,48 +618,59 @@ html(lang="en")
+method("__getitem__", "key_str")
+params
+param("key_str", unicode_type)
+param("key_str", types.unicode)
p A string in the vocabulary
+returns("Lexeme")
+method("__setitem__", "orth_str", "props")
+params
+param("orth_str", unicode_type)
+param("orth_str", types.unicode)
p The orth key
+param("props", dict_type)
+param("props", types.dict)
p A props dictionary
+returns("None")
details
summary: h4 Import/Export
+method("dump", "loc")
+params
+param("loc", unicode_type)
+param("loc", types.unicode)
p Path where the vocabulary should be saved
+method("load_lexemes", "loc")
+params
+param("loc", unicode_type)
+param("loc", types.unicode)
p Path to load the lexemes.bin file from
+method("load_vectors", "loc")
+params
+param("loc", unicode_type)
+param("loc", types.unicode)
p Path to load the vectors.bin file from
+declare_class("StringStore")
+init
Tmp
+declare_class("spacy.strings.StringStore")
+sequence
+method("__len__")
+returns("int")
p Number of strings in the string-store
+method("__iter__")
+returns
p A string in the store
+maptype
+method("__getitem__", "key_int")
+params
+param("key_int")
p An integer key
+returns(unicode_type)
+returns(types.unicode)
p The string that the integer key maps to
+method("__getitem__", "key_unicode")
@@ -539,17 +678,20 @@ html(lang="en")
+param("key_unicode")
p A key, as a unicode string
+returns(int_type)
+returns(types.int)
p The integer ID of the string.
+method("__getitem__", "key_utf8_bytes")
+params
+param("key_utf8_bytes", bytes_type)
+param("key_utf8_bytes", types.bytes)
p A key, as a UTF-8 encoded byte-string
+returns(int_type)
+returns(types.int)
p The integer ID of the string.
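p
| A sketch of the two-way mapping, assuming the store is reached
| through <code>nlp.vocab.strings</code>:
pre.language-python
code
| string_store = nlp.vocab.strings
| hello_id = string_store[u'hello']           # unicode key -> int ID
| assert string_store[hello_id] == u'hello'   # int key -> string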
details
summary: h4 Import/Export
+method("dump", "loc")
+params
+param("loc")