* Work on documentation. Have overall structure now

Matthew Honnibal 2015-08-12 20:21:26 +02:00
parent ab39f358c1
commit c767ab9fdf


@@ -1,17 +1,19 @@
- var unicode_type = '<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>'
- var bool_type = '<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>'
- var int_type = ""
- var Token_type = ""
- var Span_type = ""
- var Vocab_type = ""
- var generator_type = ""
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
-
var types = {
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
'int': py_docs + 'functions.html#int"><em>int</em></a>',
'generator': "",
'Vocab': "",
'Span': "",
'Doc': ""
}
mixin declare_class(name)
details(open="true")
details
summary
span.declaration
span.label class
@@ -62,14 +64,54 @@ mixin returns(name, type, value)
mixin returns(type)
| tmp
mixin init
details
summary: h4 Init
block
mixin callable
details
summary: h4 Callable
block
mixin sequence
details
summary: h4 Sequence
block
mixin maptype
details
summary: h4 Map
block
mixin summary
block
mixin en_example
pre.language-python
code
| from spacy.en import English
| from spacy._doc_examples import download_war_and_peace
|
| unprocessed_unicode = download_war_and_peace()
|
| nlp = English()
| doc = nlp(unprocessed_unicode)
doctype html
html(lang="en")
head
meta(charset="utf-8")
title!= tag_line
title spaCy &ndash; Industrial-strength NLP
meta(name="description" content="")
meta(name="author" content="Matthew Honnibal")
link(rel="stylesheet" href="css/style.css")
@@ -78,9 +120,9 @@ html(lang="en")
<![endif]-->
body(id="docs")
header
h1.logo!= tag_line
div.slogan!= slogan
header(role="banner")
h1.logo spaCy &ndash; Industrial-strength NLP
div.slogan API
nav(role="navigation")
@@ -91,14 +133,27 @@ html(lang="en")
li: a(href="#") Blog
main.docs#content
section.intro
| Tmp
article
h3: a(href="#") Header
+declare_class("English")
p Load models into a callable object to process English text.
+declare_class("spacy.en.English")
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
+summary
+en_example
+init
p
| Load the resources. Loading takes 20 seconds, and the instance
| consumes 2 to 3 gigabytes of memory.
p
| Intended use is for one instance to be created per process.
| You can create more if you're doing something unusual.
p
| You may wish to make the instance a global variable or "singleton".
| We usually instantiate the object in the <code>main()</code>
| function and pass it around as an explicit argument.
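p
| A minimal sketch of that pattern (the <code>count_tokens</code>
| helper is hypothetical):
pre.language-python
code
| from spacy.en import English
|
| def count_tokens(nlp, text):
|     # Hypothetical helper: receives the shared instance explicitly.
|     return len(nlp(text))
|
| def main():
|     nlp = English()  # Load once per process
|     print(count_tokens(nlp, u'A sentence to process.'))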
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
+params
+param("data_dir")
@@ -120,11 +175,11 @@ html(lang="en")
+param("load_vectors")
| A boolean value to control whether the word vectors are loaded.
+method("__call__", "text, tag=True, parse=True, entity=True")(open)
+callable
+method("__call__", "text, tag=True, parse=True, entity=True")
+params
+param("text", unicode_type)
+param("text", types.unicode)
| The text to be processed. No pre-processing needs to be applied,
| and any length of text can be submitted. Usually you will submit
| a whole document. Text may be zero-length. An exception is raised
@@ -152,17 +207,22 @@ html(lang="en")
| # doc = nlp(b'Some text') <-- Error: need unicode
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
+declare_class("spacy.tokens.doc.Doc")
+declare_class("Doc")
p I'm a doc
+init
+method("__init__", "vocab")
+params
+param("vocab", vocab_type)
| A vocabulary object
+method("__getitem__", "i", int_type)
+returns(Token_type)
+sequence
+method("__getitem__", "i", types.int)
+returns(types.Token)
+method("__getitem__", "start_end", slice_type)
+returns(Span_type)
+method("__getitem__", "start_end", types.slice)
+returns(types.Span)
+method("__iter__")
| Iterate over tokens
@@ -170,13 +230,19 @@ html(lang="en")
+method("__len__")
| Number of tokens in the document.
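p
| For example, a small sketch (assuming <code>nlp</code> is a loaded
| <code>English</code> instance):
pre.language-python
code
| doc = nlp(u'Hello, world. Here are two sentences.')
| words = [token.orth_ for token in doc]  # __iter__ yields Token objects
| n_tokens = len(doc)                     # __len__ counts tokens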
+attribute("sents", generator_type)
details
summary: h4 Spans
+attribute("sents", types.generator)
| Iterate over sentences in the document.
+attribute("ents", generator_type)
+attribute("ents", types.generator)
| Iterate over named entities in the document.
+attribute("noun_chunks", generator_type)
+attribute("noun_chunks", types.generator)
details
summary: h4 Export/Import
+method("to_array", "attr_ids")
@@ -184,7 +250,6 @@ html(lang="en")
| of shape N*M, where N is the length of the sentence.
+params
+param("attr_ids", "list[int]")
| A list of attribute ID ints.
@@ -193,7 +258,6 @@ html(lang="en")
| indicated in the input attr_ids.
+method("count_by", "attr_id")
| Produce a dict of {attribute (int): count (int)} frequencies, keyed
| by the values of the given attribute ID.
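p
| For instance, a sketch using the attribute-ID constants (assuming
| they can be imported from <code>spacy.attrs</code>):
pre.language-python
code
| from spacy.attrs import ORTH, POS
|
| feature_matrix = doc.to_array([ORTH, POS])  # one row per token
| orth_counts = doc.count_by(ORTH)            # {orth ID: frequency}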
@@ -213,31 +277,29 @@ html(lang="en")
+method("from_array", "attrs, array")
| Load from array
+method("to_bytes")
| Serialize
+method("from_bytes")
| Deserialize, loading from bytes
+method("read_bytes")
| classmethod
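p
| A sketch of a round-trip through the serialization methods (the
| file name is illustrative; assumes <code>nlp</code> and
| <code>doc</code> from the examples above):
pre.language-python
code
| from spacy.tokens.doc import Doc
|
| with open('docs.bin', 'wb') as file_:
|     file_.write(doc.to_bytes())
| with open('docs.bin', 'rb') as file_:
|     # read_bytes yields one byte-string per serialized Doc
|     for byte_string in Doc.read_bytes(file_):
|         doc = Doc(nlp.vocab).from_bytes(byte_string)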
+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
| Merge a multi-word expression into a single token. Currently
| experimental; API is likely to change.
// | Merge a multi-word expression into a single token. Currently
// | experimental; API is likely to change.
+declare_class("spacy.tokens.Token")
+declare_class("Token")
+init
+method("__init__", "vocab, doc, offset")
+params
+param("vocab", Vocab_type)
+param("vocab", types.Vocab)
p A Vocab object
+param("doc", Doc_type)
+param("doc", types.Doc)
p The parent sequence
+param("offset", Int_type)
+param("offset", types.int)
p The index of the token within the document
details
@@ -336,11 +398,13 @@ html(lang="en")
summary: h4 Syntactic Tags
+attribute("pos / pos_")
p
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
| the 17 tag values are provided in <code>spacy.parts_of_speech</code>.
+attribute("tag / tag_")
p
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
| <code>DT</code>, etc. These tags are language/corpus specific, and
| typically describe part-of-speech and some amount of morphological
@@ -348,6 +412,7 @@ html(lang="en")
| is assigned to a present-tense singular verb.
+attribute("dep / dep_")
p
| The type of syntactic dependency relation between the word and its
| syntactic head.
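p
| For example, on a parsed document:
pre.language-python
code
| token = doc[0]
| token.pos_  # coarse tag from the Universal Tag Set, e.g. u'NOUN'
| token.tag_  # fine-grained, corpus-specific tag, e.g. u'NN'
| token.dep_  # the label of the relation to the token's syntactic head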
@@ -426,8 +491,14 @@ html(lang="en")
//+attribute("conjuncts")
// | Conjuncts
+declare_class("spacy.tokens.span.Span")
+params
+declare_class("Span")
+init
+method("__init__")
Temp
<code>span = doc[0:4]</code>
+sequence
+method("__getitem__")
p Get item
@@ -437,6 +508,9 @@ html(lang="en")
+method("__len__")
p Len
details
summary: h4 Parse
+attribute("root")
p Syntactic head
@@ -464,6 +538,13 @@ html(lang="en")
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
| if span.doc[i].head in span]
+attribute("subtree")
p String
details
summary: h4 String Views
+attribute("string")
p String
@@ -473,14 +554,61 @@ html(lang="en")
+attribute("label / label_")
p String
+attribute("subtree")
p String
+declare_class("Lexeme")
p
| The Lexeme object represents a lexical type, stored in the vocabulary
| &ndash; as opposed to a token, occurring in a document.
p
| Lexemes store various features, so that these features can be computed
| once per type, rather than once per token. As job sizes grow, this
| can amount to a substantial efficiency improvement.
+declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
p
| All Lexeme attributes are therefore context independent, as a single
| lexeme is reused for all usages of that word. Lexemes are keyed by
| the “orth” attribute.
p
| All Lexeme attributes are accessible directly on the Token object.
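p
| For instance, a sketch of the type/token distinction (assuming
| <code>nlp</code> is a loaded <code>English</code> instance):
pre.language-python
code
| # One Lexeme per type, looked up through the vocabulary:
| lexeme = nlp.vocab[u'hello']
| # The same context-independent attributes, read through tokens:
| doc = nlp(u'hello world hello')
| assert doc[0].orth == doc[2].orth == lexeme.orth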
+init
+method("__init__")
p Init
details
summary: h4 String Features
+attribute("orth / orth_")
p
| The form of the word with no string normalization or processing,
| as it appears in the string, without trailing whitespace.
+attribute("lower / lower_")
p Tmp
+attribute("norm / norm_")
p Tmp
+attribute("shape / shape_")
p Tmp
+attribute("prefix / prefix_")
p Tmp
+attribute("suffix / suffix_")
p Tmp
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
+sequence
+method("__len__")
+returns
p Number of words in the vocabulary.
+method("__iter__")
+returns
p Lexeme
+maptype
+method("__getitem__", "key_int")
+params
+param("key")
@@ -490,48 +618,59 @@ html(lang="en")
+method("__getitem__", "key_str")
+params
+param("key_str", unicode_type)
+param("key_str", types.unicode)
p A string in the vocabulary
+returns("Lexeme")
+method("__setitem__", "orth_str", "props")
+params
+param("orth_str", unicode_type)
+param("orth_str", types.unicode)
p The orth key
+param("props", dict_type)
+param("props", types.dict)
p A props dictionary
+returns("None")
details
summary: h4 Import/Export
+method("dump", "loc")
+params
+param("loc", unicode_type)
+param("loc", types.unicode)
p Path where the vocabulary should be saved
+method("load_lexemes", "loc")
+params
+param("loc", unicode_type)
+param("loc", types.unicode)
p Path to load the lexemes.bin file from
+method("load_vectors", "loc")
+params
+param("loc", unicode_type)
+param("loc", types.unicode)
p Path to load the vectors.bin file from
+declare_class("StringStore")
+init
Tmp
+declare_class("spacy.strings.StringStore")
+sequence
+method("__len__")
+returns("int")
p Number of strings in the string-store
+method("__iter__")
+returns
p A string in the store
+maptype
+method("__getitem__", "key_int")
+params
+param("key_int")
p An integer key
+returns(unicode_type)
+returns(types.unicode)
p The string that the integer key maps to
+method("__getitem__", "key_unicode")
@@ -539,17 +678,20 @@ html(lang="en")
+param("key_unicode")
p A key, as a unicode string
+returns(int_type)
+returns(types.int)
p The integer ID of the string.
+method("__getitem__", "key_utf8_bytes")
+params
+param("key_utf8_bytes", bytes_type)
+param("key_utf8_bytes", types.bytes)
p A key, as a UTF-8 encoded byte-string
+returns(int_type)
+returns(types.int)
p The integer ID of the string.
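p
| A sketch of the two-way mapping, assuming the store is reached
| through <code>nlp.vocab.strings</code>:
pre.language-python
code
| string_store = nlp.vocab.strings
| hello_id = string_store[u'hello']           # unicode key -> int ID
| assert string_store[hello_id] == u'hello'   # int key -> string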
details
summary: h4 Import/Export
+method("dump", "loc")
+params
+param("loc")