mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
* Work on documentation. Have overall structure now
This commit is contained in:
parent
ab39f358c1
commit
c767ab9fdf
|
@ -1,17 +1,19 @@
|
|||
- var unicode_type = '<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>'
|
||||
- var bool_type = '<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>'
|
||||
|
||||
- var int_type = ""
|
||||
|
||||
- var Token_type = ""
|
||||
- var Span_type = ""
|
||||
- var Vocab_type = ""
|
||||
- var generator_type = ""
|
||||
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
|
||||
|
||||
-
|
||||
var types = {
|
||||
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
|
||||
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
|
||||
'int': py_docs + 'functions.html#int"><em>int</em></a>',
|
||||
'generator': "",
|
||||
'Vocab': "",
|
||||
'Span': "",
|
||||
'Doc': ""
|
||||
}
|
||||
|
||||
|
||||
mixin declare_class(name)
|
||||
details(open="true")
|
||||
details
|
||||
summary
|
||||
span.declaration
|
||||
span.label class
|
||||
|
@ -62,14 +64,54 @@ mixin returns(name, type, value)
|
|||
mixin returns(type)
|
||||
| tmp
|
||||
|
||||
mixin init
|
||||
details
|
||||
summary: h4 Init
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin callable
|
||||
details
|
||||
summary: h4 Callable
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin sequence
|
||||
details
|
||||
summary: h4 Sequence
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin maptype
|
||||
details
|
||||
summary: h4 Map
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin summary
|
||||
block
|
||||
|
||||
mixin en_example
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| from spacy._doc_examples import download_war_and_peace
|
||||
|
|
||||
| unprocessed_unicode = download_war_and_peace()
|
||||
|
|
||||
| nlp = English()
|
||||
| doc = nlp(unprocessed_unicode)
|
||||
|
||||
|
||||
doctype html
|
||||
html(lang="en")
|
||||
head
|
||||
meta(charset="utf-8")
|
||||
title!= tag_line
|
||||
title spaCy – Industrial-strength NLP
|
||||
meta(name="description" content="")
|
||||
meta(name="author" content="Matthew Honnibal")
|
||||
link(rel="stylesheet" href="css/style.css")
|
||||
|
@ -78,9 +120,9 @@ html(lang="en")
|
|||
<![endif]-->
|
||||
|
||||
body(id="docs")
|
||||
header
|
||||
h1.logo!= tag_line
|
||||
div.slogan!= slogan
|
||||
header(role="banner")
|
||||
h1.logo spaCy – Industrial-strength NLP
|
||||
div.slogan API
|
||||
|
||||
|
||||
nav(role="navigation")
|
||||
|
@ -91,14 +133,27 @@ html(lang="en")
|
|||
li: a(href="#") Blog
|
||||
|
||||
main.docs#content
|
||||
section.intro
|
||||
| Tmp
|
||||
|
||||
article
|
||||
h3: a(href="#") Header
|
||||
+declare_class("English")
|
||||
p Load models into a callable object to process English text.
|
||||
|
||||
+declare_class("spacy.en.English")
|
||||
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
|
||||
+summary
|
||||
+en_example
|
||||
|
||||
+init
|
||||
p
|
||||
| Load the resources. Loading takes 20 seconds, and the instance
|
||||
| consumes 2 to 3 gigabytes of memory.
|
||||
|
||||
p
|
||||
| Intended use is for one instance to be created per process.
|
||||
| You can create more if you're doing something unusual.
|
||||
p
|
||||
| You may wish to make the instance a global variable or "singleton".
|
||||
| We usually instantiate the object in the <code>main()</code>
|
||||
| function and pass it around as an explicit argument.
|
||||
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
|
||||
|
||||
+params
|
||||
+param("data_dir")
|
||||
|
@ -120,11 +175,11 @@ html(lang="en")
|
|||
+param("load_vectors")
|
||||
| A boolean value to control whether the word vectors are loaded.
|
||||
|
||||
|
||||
+method("__call__", "text, tag=True, parse=True, entity=True")(open)
|
||||
+callable
|
||||
+method("__call__", "text, tag=True, parse=True, entity=True")
|
||||
|
||||
+params
|
||||
+param("text", unicode_type)
|
||||
+param("text", types.unicode)
|
||||
| The text to be processed. No pre-processing needs to be applied,
|
||||
| and any length of text can be submitted. Usually you will submit
|
||||
| a whole document. Text may be zero-length. An exception is raised
|
||||
|
@ -152,17 +207,22 @@ html(lang="en")
|
|||
| # doc = nlp(b'Some text') <-- Error: need unicode
|
||||
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
||||
|
||||
+declare_class("spacy.tokens.doc.Doc")
|
||||
|
||||
+declare_class("Doc")
|
||||
p I'm a doc
|
||||
|
||||
+init
|
||||
+method("__init__", "vocab")
|
||||
+params
|
||||
+param("vocab", vocab_type)
|
||||
| A vocabulary object
|
||||
|
||||
+method("__getitem__", "i", int_type)
|
||||
+returns(Token_type)
|
||||
+sequence
|
||||
+method("__getitem__", "i", types.int)
|
||||
+returns(types.Token)
|
||||
|
||||
+method("__getitem__", "start_end", slice_type)
|
||||
+returns(Span_type)
|
||||
+method("__getitem__", "start_end", types.slice)
|
||||
+returns(types.Span)
|
||||
|
||||
+method("__iter__")
|
||||
| Iterate over tokens
|
||||
|
@ -170,13 +230,19 @@ html(lang="en")
|
|||
+method("__len__")
|
||||
| Number of tokens in the document.
|
||||
|
||||
+attribute("sents", generator_type)
|
||||
details
|
||||
summary: h4 Spans
|
||||
|
||||
+attribute("sents", types.generator)
|
||||
| Iterate over sentences in the document.
|
||||
|
||||
+attribute("ents", generator_type)
|
||||
+attribute("ents", types.generator)
|
||||
| Iterate over named entities in the document.
|
||||
|
||||
+attribute("noun_chunks", generator_type)
|
||||
+attribute("noun_chunks", types.generator)
|
||||
|
||||
details
|
||||
summary: h4 Export/Import
|
||||
|
||||
+method("to_array", "attr_ids")
|
||||
|
||||
|
@ -184,7 +250,6 @@ html(lang="en")
|
|||
| of shape N*M, where N is the length of the sentence.
|
||||
|
||||
+params
|
||||
|
||||
+param("attr_ids", "list[int]")
|
||||
| A list of attribute ID ints.
|
||||
|
||||
|
@ -193,7 +258,6 @@ html(lang="en")
|
|||
| indicated in the input attr_ids.
|
||||
|
||||
+method("count_by", "attr_id")
|
||||
|
||||
| Produce a dict of {attribute (int): count (int)} frequencies, keyed
|
||||
| by the values of the given attribute ID.
|
||||
|
||||
|
@ -213,31 +277,29 @@ html(lang="en")
|
|||
+method("from_array", "attrs, array")
|
||||
| Load from array
|
||||
|
||||
+method("to_bytes")
|
||||
| Serialize
|
||||
|
||||
+method("from_bytes")
|
||||
| Deserialize, loading from bytes
|
||||
|
||||
+method("read_bytes")
|
||||
| classmethod
|
||||
|
||||
+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
||||
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
||||
|
||||
| Merge a multi-word expression into a single token. Currently
|
||||
| experimental; API is likely to change.
|
||||
// | Merge a multi-word expression into a single token. Currently
|
||||
// | experimental; API is likely to change.
|
||||
|
||||
|
||||
+declare_class("spacy.tokens.Token")
|
||||
+declare_class("Token")
|
||||
+init
|
||||
+method("__init__", "vocab, doc, offset")
|
||||
+params
|
||||
+param("vocab", Vocab_type)
|
||||
+param("vocab", types.Vocab)
|
||||
p A Vocab object
|
||||
|
||||
+param("doc", Doc_type)
|
||||
+param("doc", types.Doc)
|
||||
p The parent sequence
|
||||
|
||||
+param("offset", Int_type)
|
||||
+param("offset", types.int)
|
||||
p The index of the token within the document
|
||||
|
||||
details
|
||||
|
@ -336,11 +398,13 @@ html(lang="en")
|
|||
summary: h4 Syntactic Tags
|
||||
|
||||
+attribute("pos / pos_")
|
||||
p
|
||||
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
|
||||
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
|
||||
| the 17 tag values are provided in <code>spacy.parts_of_speech.</code>
|
||||
|
||||
+attribute("tag / tag_")
|
||||
p
|
||||
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
|
||||
| <code>DT</code>, etc. These tags are language/corpus specific, and
|
||||
| typically describe part-of-speech and some amount of morphological
|
||||
|
@ -348,6 +412,7 @@ html(lang="en")
|
|||
| is assigned to a present-tense singular verb.
|
||||
|
||||
+attribute("dep / dep_")
|
||||
p
|
||||
| The type of syntactic dependency relation between the word and its
|
||||
| syntactic head.
|
||||
|
||||
|
@ -426,8 +491,14 @@ html(lang="en")
|
|||
//+attribute("conjuncts")
|
||||
// | Conjuncts
|
||||
|
||||
+declare_class("spacy.tokens.span.Span")
|
||||
+params
|
||||
+declare_class("Span")
|
||||
+init
|
||||
+method("__init__")
|
||||
| Temp
|
||||
|
||||
<code>span = doc[0:4]</code>
|
||||
|
||||
+sequence
|
||||
+method("__getitem__")
|
||||
p Get item
|
||||
|
||||
|
@ -437,6 +508,9 @@ html(lang="en")
|
|||
+method("__len__")
|
||||
p Len
|
||||
|
||||
details
|
||||
summary: h4 Parse
|
||||
|
||||
+attribute("root")
|
||||
p Syntactic head
|
||||
|
||||
|
@ -464,6 +538,13 @@ html(lang="en")
|
|||
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
|
||||
+attribute("subtree")
|
||||
p String
|
||||
|
||||
details
|
||||
summary: h4 String Views
|
||||
|
||||
+attribute("string")
|
||||
p String
|
||||
|
||||
|
@ -473,14 +554,61 @@ html(lang="en")
|
|||
+attribute("label / label_")
|
||||
p String
|
||||
|
||||
+attribute("subtree")
|
||||
p String
|
||||
+declare_class("Lexeme")
|
||||
p
|
||||
| The Lexeme object represents a lexical type, stored in the vocabulary
|
||||
| – as opposed to a token, occurring in a document.
|
||||
p
|
||||
| Lexemes store various features, so that these features can be computed
|
||||
| once per type, rather than once per token. As job sizes grow, this
|
||||
| can amount to a substantial efficiency improvement.
|
||||
|
||||
+declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
|
||||
p
|
||||
| All Lexeme attributes are therefore context independent, as a single
|
||||
| lexeme is reused for all usages of that word. Lexemes are keyed by
|
||||
| the “orth” attribute.
|
||||
|
||||
p
|
||||
| All Lexeme attributes are accessible directly on the Token object.
|
||||
|
||||
+init
|
||||
+method("__init__")
|
||||
p Init
|
||||
|
||||
details
|
||||
summary: h4 String Features
|
||||
|
||||
+attribute("orth / orth_")
|
||||
p
|
||||
| The form of the word with no string normalization or processing,
|
||||
| as it appears in the string, without trailing whitespace.
|
||||
|
||||
+attribute("lower / lower_")
|
||||
p Tmp
|
||||
|
||||
+attribute("norm / norm_")
|
||||
p Tmp
|
||||
|
||||
+attribute("shape / shape_")
|
||||
p Tmp
|
||||
|
||||
+attribute("prefix / prefix_")
|
||||
p Tmp
|
||||
|
||||
+attribute("suffix / suffix_")
|
||||
p TMP
|
||||
|
||||
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
|
||||
+sequence
|
||||
+method("__len__")
|
||||
+returns
|
||||
p Number of words in the vocabulary.
|
||||
|
||||
+method("__iter__")
|
||||
+returns
|
||||
p Lexeme
|
||||
|
||||
+maptype
|
||||
+method("__getitem__", "key_int")
|
||||
+params
|
||||
+param("key")
|
||||
|
@ -490,48 +618,59 @@ html(lang="en")
|
|||
|
||||
+method("__getitem__", "key_str")
|
||||
+params
|
||||
+param("key_str", unicode_type)
|
||||
+param("key_str", types.unicode)
|
||||
p A string in the vocabulary
|
||||
|
||||
+returns("Lexeme")
|
||||
|
||||
+method("__setitem__", "orth_str", "props")
|
||||
+params
|
||||
+param("orth_str", unicode_type)
|
||||
+param("orth_str", types.unicode)
|
||||
p The orth key
|
||||
|
||||
+param("props", dict_type)
|
||||
+param("props", types.dict)
|
||||
p A props dictionary
|
||||
|
||||
+returns("None")
|
||||
|
||||
details
|
||||
summary: h4 Import/Export
|
||||
|
||||
+method("dump", "loc")
|
||||
+params
|
||||
+param("loc", unicode_type)
|
||||
+param("loc", types.unicode)
|
||||
p Path where the vocabulary should be saved
|
||||
|
||||
+method("load_lexemes", "loc")
|
||||
+params
|
||||
+param("loc", unicode_type)
|
||||
+param("loc", types.unicode)
|
||||
p Path to load the lexemes.bin file from
|
||||
|
||||
+method("load_vectors", "loc")
|
||||
+params
|
||||
+param("loc", unicode_type)
|
||||
+param("loc", types.unicode)
|
||||
p Path to load the vectors.bin from
|
||||
|
||||
+declare_class("StringStore")
|
||||
+init
|
||||
| Tmp
|
||||
|
||||
+declare_class("spacy.strings.StringStore")
|
||||
+sequence
|
||||
+method("__len__")
|
||||
+returns("int")
|
||||
p Number of strings in the string-store
|
||||
|
||||
+method("__iter__")
|
||||
+returns
|
||||
p Lexeme
|
||||
|
||||
+maptype
|
||||
+method("__getitem__", "key_int")
|
||||
+params
|
||||
+param("key_int")
|
||||
p An integer key
|
||||
|
||||
+returns(unicode_type)
|
||||
+returns(types.unicode)
|
||||
p The string that the integer key maps to
|
||||
|
||||
+method("__getitem__", "key_unicode")
|
||||
|
@ -539,17 +678,20 @@ html(lang="en")
|
|||
+param("key_unicode")
|
||||
p A key, as a unicode string
|
||||
|
||||
+returns(int_type)
|
||||
+returns(types.int)
|
||||
p The integer ID of the string.
|
||||
|
||||
+method("__getitem__", "key_utf8_bytes")
|
||||
+params
|
||||
+param("key_utf8_bytes", bytes_type)
|
||||
+param("key_utf8_bytes", types.bytes)
|
||||
p A key, as a UTF-8 encoded byte-string
|
||||
|
||||
+returns(int_type)
|
||||
+returns(types.int)
|
||||
p The integer ID of the string.
|
||||
|
||||
details
|
||||
summary: h4 Import/Export
|
||||
|
||||
+method("dump", "loc")
|
||||
+params
|
||||
+param("loc")
|
||||
|
|
Loading…
Reference in New Issue
Block a user