mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 18:36:36 +03:00
* Work on documentation. Have overall structure now
This commit is contained in:
parent
ab39f358c1
commit
c767ab9fdf
|
@ -1,17 +1,19 @@
|
||||||
- var unicode_type = '<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>'
|
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
|
||||||
- var bool_type = '<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>'
|
|
||||||
|
|
||||||
- var int_type = ""
|
|
||||||
|
|
||||||
- var Token_type = ""
|
|
||||||
- var Span_type = ""
|
|
||||||
- var Vocab_type = ""
|
|
||||||
- var generator_type = ""
|
|
||||||
|
|
||||||
|
-
|
||||||
|
var types = {
|
||||||
|
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
|
||||||
|
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
|
||||||
|
'int': py_docs + 'functions.html#int"><em>int</em></a>',
|
||||||
|
'generator': "",
|
||||||
|
'Vocab': "",
|
||||||
|
'Span': "",
|
||||||
|
'Doc': ""
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
mixin declare_class(name)
|
mixin declare_class(name)
|
||||||
details(open="true")
|
details
|
||||||
summary
|
summary
|
||||||
span.declaration
|
span.declaration
|
||||||
span.label class
|
span.label class
|
||||||
|
@ -62,14 +64,54 @@ mixin returns(name, type, value)
|
||||||
mixin returns(type)
|
mixin returns(type)
|
||||||
| tmp
|
| tmp
|
||||||
|
|
||||||
|
mixin init
|
||||||
|
details
|
||||||
|
summary: h4 Init
|
||||||
|
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
|
mixin callable
|
||||||
|
details
|
||||||
|
summary: h4 Callable
|
||||||
|
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
|
mixin sequence
|
||||||
|
details
|
||||||
|
summary: h4 Sequence
|
||||||
|
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
|
mixin maptype
|
||||||
|
details
|
||||||
|
summary: h4 Map
|
||||||
|
|
||||||
|
block
|
||||||
|
|
||||||
|
|
||||||
|
mixin summary
|
||||||
|
block
|
||||||
|
|
||||||
|
mixin en_example
|
||||||
|
pre.language-python
|
||||||
|
code
|
||||||
|
| from spacy.en import English
|
||||||
|
| from spacy._doc_examples import download_war_and_peace
|
||||||
|
|
|
||||||
|
| unprocessed_unicode = download_war_and_peace()
|
||||||
|
|
|
||||||
|
| nlp = English()
|
||||||
|
| doc = nlp(unprocessed_unicode)
|
||||||
|
|
||||||
|
|
||||||
doctype html
|
doctype html
|
||||||
html(lang="en")
|
html(lang="en")
|
||||||
head
|
head
|
||||||
meta(charset="utf-8")
|
meta(charset="utf-8")
|
||||||
title!= tag_line
|
title spaCy – Industrial-strength NLP
|
||||||
meta(name="description" content="")
|
meta(name="description" content="")
|
||||||
meta(name="author" content="Matthew Honnibal")
|
meta(name="author" content="Matthew Honnibal")
|
||||||
link(rel="stylesheet" href="css/style.css")
|
link(rel="stylesheet" href="css/style.css")
|
||||||
|
@ -78,9 +120,9 @@ html(lang="en")
|
||||||
<![endif]-->
|
<![endif]-->
|
||||||
|
|
||||||
body(id="docs")
|
body(id="docs")
|
||||||
header
|
header(role="banner")
|
||||||
h1.logo!= tag_line
|
h1.logo spaCy – Industrial-strength NLP
|
||||||
div.slogan!= slogan
|
div.slogan API
|
||||||
|
|
||||||
|
|
||||||
nav(role="navigation")
|
nav(role="navigation")
|
||||||
|
@ -91,14 +133,27 @@ html(lang="en")
|
||||||
li: a(href="#") Blog
|
li: a(href="#") Blog
|
||||||
|
|
||||||
main.docs#content
|
main.docs#content
|
||||||
section.intro
|
|
||||||
| Tmp
|
|
||||||
|
|
||||||
article
|
article
|
||||||
h3: a(href="#") Header
|
+declare_class("English")
|
||||||
|
p Load models into a callable object to process English text.
|
||||||
|
|
||||||
+declare_class("spacy.en.English")
|
+summary
|
||||||
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
|
+en_example
|
||||||
|
|
||||||
|
+init
|
||||||
|
p
|
||||||
|
| Load the resources. Loading takes 20 seconds, and the instance
|
||||||
|
| consumes 2 to 3 gigabytes of memory.
|
||||||
|
|
||||||
|
p
|
||||||
|
| Intended use is for one instance to be created per process.
|
||||||
|
| You can create more if you're doing something unusual.
|
||||||
|
p
|
||||||
|
| You may wish to make the instance a global variable or "singleton".
|
||||||
|
| We usually instantiate the object in the <code>main()</code>
|
||||||
|
| function and pass it around as an explicit argument.
|
||||||
|
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
|
||||||
|
|
||||||
+params
|
+params
|
||||||
+param("data_dir")
|
+param("data_dir")
|
||||||
|
@ -120,11 +175,11 @@ html(lang="en")
|
||||||
+param("load_vectors")
|
+param("load_vectors")
|
||||||
| A boolean value to control whether the word vectors are loaded.
|
| A boolean value to control whether the word vectors are loaded.
|
||||||
|
|
||||||
|
+callable
|
||||||
+method("__call__", "text, tag=True, parse=True, entity=True")(open)
|
+method("__call__", "text, tag=True, parse=True, entity=True")
|
||||||
|
|
||||||
+params
|
+params
|
||||||
+param("text", unicode_type)
|
+param("text", types.unicode)
|
||||||
| The text to be processed. No pre-processing needs to be applied,
|
| The text to be processed. No pre-processing needs to be applied,
|
||||||
| and any length of text can be submitted. Usually you will submit
|
| and any length of text can be submitted. Usually you will submit
|
||||||
| a whole document. Text may be zero-length. An exception is raised
|
| a whole document. Text may be zero-length. An exception is raised
|
||||||
|
@ -152,17 +207,22 @@ html(lang="en")
|
||||||
| # doc = nlp(b'Some text') <-- Error: need unicode
|
| # doc = nlp(b'Some text') <-- Error: need unicode
|
||||||
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
||||||
|
|
||||||
+declare_class("spacy.tokens.doc.Doc")
|
|
||||||
|
+declare_class("Doc")
|
||||||
|
p I'm a doc
|
||||||
|
|
||||||
|
+init
|
||||||
+method("__init__", "vocab")
|
+method("__init__", "vocab")
|
||||||
+params
|
+params
|
||||||
+param("vocab", vocab_type)
|
+param("vocab", vocab_type)
|
||||||
| A vocabulary object
|
| A vocabulary object
|
||||||
|
|
||||||
+method("__getitem__", "i", int_type)
|
+sequence
|
||||||
+returns(Token_type)
|
+method("__getitem__", "i", types.int)
|
||||||
|
+returns(types.Token)
|
||||||
|
|
||||||
+method("__getitem__", "start_end", slice_type)
|
+method("__getitem__", "start_end", types.slice)
|
||||||
+returns(Span_type)
|
+returns(types.Span)
|
||||||
|
|
||||||
+method("__iter__")
|
+method("__iter__")
|
||||||
| Iterate over tokens
|
| Iterate over tokens
|
||||||
|
@ -170,13 +230,19 @@ html(lang="en")
|
||||||
+method("__len__")
|
+method("__len__")
|
||||||
| Number of tokens in the document.
|
| Number of tokens in the document.
|
||||||
|
|
||||||
+attribute("sents", generator_type)
|
details
|
||||||
|
summary: h4 Spans
|
||||||
|
|
||||||
|
+attribute("sents", types.generator)
|
||||||
| Iterate over sentences in the document.
|
| Iterate over sentences in the document.
|
||||||
|
|
||||||
+attribute("ents", generator_type)
|
+attribute("ents", types.generator)
|
||||||
| Iterate over named entities in the document.
|
| Iterate over named entities in the document.
|
||||||
|
|
||||||
+attribute("noun_chunks", generator_type)
|
+attribute("noun_chunks", types.generator)
|
||||||
|
|
||||||
|
details
|
||||||
|
summary: h4 Export/Import
|
||||||
|
|
||||||
+method("to_array", "attr_ids")
|
+method("to_array", "attr_ids")
|
||||||
|
|
||||||
|
@ -184,7 +250,6 @@ html(lang="en")
|
||||||
| of shape N*M, where N is the length of the sentence.
|
| of shape N*M, where N is the length of the sentence.
|
||||||
|
|
||||||
+params
|
+params
|
||||||
|
|
||||||
+param("attr_ids", "list[int]")
|
+param("attr_ids", "list[int]")
|
||||||
| A list of attribute ID ints.
|
| A list of attribute ID ints.
|
||||||
|
|
||||||
|
@ -193,7 +258,6 @@ html(lang="en")
|
||||||
| indicated in the input attr_ids.
|
| indicated in the input attr_ids.
|
||||||
|
|
||||||
+method("count_by", "attr_id")
|
+method("count_by", "attr_id")
|
||||||
|
|
||||||
| Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
| Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||||
| by the values of the given attribute ID.
|
| by the values of the given attribute ID.
|
||||||
|
|
||||||
|
@ -213,31 +277,29 @@ html(lang="en")
|
||||||
+method("from_array", "attrs, array")
|
+method("from_array", "attrs, array")
|
||||||
| Load from array
|
| Load from array
|
||||||
|
|
||||||
+method("to_bytes")
|
|
||||||
| Serialize
|
|
||||||
|
|
||||||
+method("from_bytes")
|
+method("from_bytes")
|
||||||
| Deserialize, loading from bytes
|
| Deserialize, loading from bytes
|
||||||
|
|
||||||
+method("read_bytes")
|
+method("read_bytes")
|
||||||
| classmethod
|
| classmethod
|
||||||
|
|
||||||
+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
||||||
|
|
||||||
| Merge a multi-word expression into a single token. Currently
|
// | Merge a multi-word expression into a single token. Currently
|
||||||
| experimental; API is likely to change.
|
// | experimental; API is likely to change.
|
||||||
|
|
||||||
|
|
||||||
+declare_class("spacy.tokens.Token")
|
+declare_class("Token")
|
||||||
|
+init
|
||||||
+method("__init__", "vocab, doc, offset")
|
+method("__init__", "vocab, doc, offset")
|
||||||
+params
|
+params
|
||||||
+param("vocab", Vocab_type)
|
+param("vocab", types.Vocab)
|
||||||
p A Vocab object
|
p A Vocab object
|
||||||
|
|
||||||
+param("doc", Doc_type)
|
+param("doc", types.Doc)
|
||||||
p The parent sequence
|
p The parent sequence
|
||||||
|
|
||||||
+param("offset", Int_type)
|
+param("offset", types.int)
|
||||||
p The index of the token within the document
|
p The index of the token within the document
|
||||||
|
|
||||||
details
|
details
|
||||||
|
@ -336,11 +398,13 @@ html(lang="en")
|
||||||
summary: h4 Syntactic Tags
|
summary: h4 Syntactic Tags
|
||||||
|
|
||||||
+attribute("pos / pos_")
|
+attribute("pos / pos_")
|
||||||
|
p
|
||||||
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
|
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
|
||||||
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
|
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
|
||||||
| the 17 tag values are provided in <code>spacy.parts_of_speech.</code>
|
| the 17 tag values are provided in <code>spacy.parts_of_speech.</code>
|
||||||
|
|
||||||
+attribute("tag / tag_")
|
+attribute("tag / tag_")
|
||||||
|
p
|
||||||
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
|
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
|
||||||
| <code>DT</code>, etc. These tags are language/corpus specific, and
|
| <code>DT</code>, etc. These tags are language/corpus specific, and
|
||||||
| typically describe part-of-speech and some amount of morphological
|
| typically describe part-of-speech and some amount of morphological
|
||||||
|
@ -348,6 +412,7 @@ html(lang="en")
|
||||||
| is assigned to a present-tense singular verb.
|
| is assigned to a present-tense singular verb.
|
||||||
|
|
||||||
+attribute("dep / dep_")
|
+attribute("dep / dep_")
|
||||||
|
p
|
||||||
| The type of syntactic dependency relation between the word and its
|
| The type of syntactic dependency relation between the word and its
|
||||||
| syntactic head.
|
| syntactic head.
|
||||||
|
|
||||||
|
@ -426,8 +491,14 @@ html(lang="en")
|
||||||
//+attribute("conjuncts")
|
//+attribute("conjuncts")
|
||||||
// | Conjuncts
|
// | Conjuncts
|
||||||
|
|
||||||
+declare_class("spacy.tokens.span.Span")
|
+declare_class("Span")
|
||||||
+params
|
+init
|
||||||
|
+method("__init__")
|
||||||
|
Temp
|
||||||
|
|
||||||
|
<code>span = doc[0:4]</code>
|
||||||
|
|
||||||
|
+sequence
|
||||||
+method("__getitem__")
|
+method("__getitem__")
|
||||||
p Get item
|
p Get item
|
||||||
|
|
||||||
|
@ -437,6 +508,9 @@ html(lang="en")
|
||||||
+method("__len__")
|
+method("__len__")
|
||||||
p Len
|
p Len
|
||||||
|
|
||||||
|
details
|
||||||
|
summary: h4 Parse
|
||||||
|
|
||||||
+attribute("root")
|
+attribute("root")
|
||||||
p Syntactic head
|
p Syntactic head
|
||||||
|
|
||||||
|
@ -464,6 +538,13 @@ html(lang="en")
|
||||||
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||||
| if span.doc[i].head in span]
|
| if span.doc[i].head in span]
|
||||||
|
|
||||||
|
|
||||||
|
+attribute("subtree")
|
||||||
|
p String
|
||||||
|
|
||||||
|
details
|
||||||
|
summary: h4 String Views
|
||||||
|
|
||||||
+attribute("string")
|
+attribute("string")
|
||||||
p String
|
p String
|
||||||
|
|
||||||
|
@ -473,14 +554,61 @@ html(lang="en")
|
||||||
+attribute("label / label_")
|
+attribute("label / label_")
|
||||||
p String
|
p String
|
||||||
|
|
||||||
+attribute("subtree")
|
+declare_class("Lexeme")
|
||||||
p String
|
p
|
||||||
|
| The Lexeme object represents a lexical type, stored in the vocabulary
|
||||||
|
| – as opposed to a token, occurring in a document.
|
||||||
|
p
|
||||||
|
| Lexemes store various features, so that these features can be computed
|
||||||
|
| once per type, rather than once per token. As job sizes grow, this
|
||||||
|
| can amount to a substantial efficiency improvement.
|
||||||
|
|
||||||
+declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
|
p
|
||||||
|
| All Lexeme attributes are therefore context independent, as a single
|
||||||
|
| lexeme is reused for all usages of that word. Lexemes are keyed by
|
||||||
|
| the “orth” attribute.
|
||||||
|
|
||||||
|
p
|
||||||
|
| All Lexeme attributes are accessible directly on the Token object.
|
||||||
|
|
||||||
|
+init
|
||||||
|
+method("__init__")
|
||||||
|
p Init
|
||||||
|
|
||||||
|
details
|
||||||
|
summary: h4 String Features
|
||||||
|
|
||||||
|
+attribute("orth / orth_")
|
||||||
|
p
|
||||||
|
| The form of the word with no string normalization or processing,
|
||||||
|
| as it appears in the string, without trailing whitespace.
|
||||||
|
|
||||||
|
+attribute("lower / lower_")
|
||||||
|
p Tmp
|
||||||
|
|
||||||
|
+attribute("norm / norm_")
|
||||||
|
p Tmp
|
||||||
|
|
||||||
|
+attribute("shape / shape_")
|
||||||
|
p Tmp
|
||||||
|
|
||||||
|
+attribute("prefix / prefix_")
|
||||||
|
p Tmp
|
||||||
|
|
||||||
|
+attribute("suffix / suffix_")
|
||||||
|
p TMP
|
||||||
|
|
||||||
|
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
|
||||||
|
+sequence
|
||||||
+method("__len__")
|
+method("__len__")
|
||||||
+returns
|
+returns
|
||||||
p Number of words in the vocabulary.
|
p Number of words in the vocabulary.
|
||||||
|
|
||||||
|
+method("__iter__")
|
||||||
|
+returns
|
||||||
|
p Lexeme
|
||||||
|
|
||||||
|
+maptype
|
||||||
+method("__getitem__", "key_int")
|
+method("__getitem__", "key_int")
|
||||||
+params
|
+params
|
||||||
+param("key")
|
+param("key")
|
||||||
|
@ -490,48 +618,59 @@ html(lang="en")
|
||||||
|
|
||||||
+method("__getitem__", "key_str")
|
+method("__getitem__", "key_str")
|
||||||
+params
|
+params
|
||||||
+param("key_str", unicode_type)
|
+param("key_str", types.unicode)
|
||||||
p A string in the vocabulary
|
p A string in the vocabulary
|
||||||
|
|
||||||
+returns("Lexeme")
|
+returns("Lexeme")
|
||||||
|
|
||||||
+method("__setitem__", "orth_str", "props")
|
+method("__setitem__", "orth_str", "props")
|
||||||
+params
|
+params
|
||||||
+param("orth_str", unicode_type)
|
+param("orth_str", types.unicode)
|
||||||
p The orth key
|
p The orth key
|
||||||
|
|
||||||
+param("props", dict_type)
|
+param("props", types.dict)
|
||||||
p A props dictionary
|
p A props dictionary
|
||||||
|
|
||||||
+returns("None")
|
+returns("None")
|
||||||
|
|
||||||
|
details
|
||||||
|
summary: h4 Import/Export
|
||||||
|
|
||||||
+method("dump", "loc")
|
+method("dump", "loc")
|
||||||
+params
|
+params
|
||||||
+param("loc", unicode_type)
|
+param("loc", types.unicode)
|
||||||
p Path where the vocabulary should be saved
|
p Path where the vocabulary should be saved
|
||||||
|
|
||||||
+method("load_lexemes", "loc")
|
+method("load_lexemes", "loc")
|
||||||
+params
|
+params
|
||||||
+param("loc", unicode_type)
|
+param("loc", types.unicode)
|
||||||
p Path to load the lexemes.bin file from
|
p Path to load the lexemes.bin file from
|
||||||
|
|
||||||
+method("load_vectors", "loc")
|
+method("load_vectors", "loc")
|
||||||
+params
|
+params
|
||||||
+param("loc", unicode_type)
|
+param("loc", types.unicode)
|
||||||
p Path to load the vectors.bin from
|
p Path to load the vectors.bin from
|
||||||
|
|
||||||
|
+declare_class("StringStore")
|
||||||
|
+init
|
||||||
|
Tmp
|
||||||
|
|
||||||
+declare_class("spacy.strings.StringStore")
|
+sequence
|
||||||
+method("__len__")
|
+method("__len__")
|
||||||
+returns("int")
|
+returns("int")
|
||||||
p Number of strings in the string-store
|
p Number of strings in the string-store
|
||||||
|
|
||||||
|
+method("__iter__")
|
||||||
|
+returns
|
||||||
|
p Lexeme
|
||||||
|
|
||||||
|
+maptype
|
||||||
+method("__getitem__", "key_int")
|
+method("__getitem__", "key_int")
|
||||||
+params
|
+params
|
||||||
+param("key_int")
|
+param("key_int")
|
||||||
p An integer key
|
p An integer key
|
||||||
|
|
||||||
+returns(unicode_type)
|
+returns(types.unicode)
|
||||||
p The string that the integer key maps to
|
p The string that the integer key maps to
|
||||||
|
|
||||||
+method("__getitem__", "key_unicode")
|
+method("__getitem__", "key_unicode")
|
||||||
|
@ -539,17 +678,20 @@ html(lang="en")
|
||||||
+param("key_unicode")
|
+param("key_unicode")
|
||||||
p A key, as a unicode string
|
p A key, as a unicode string
|
||||||
|
|
||||||
+returns(int_type)
|
+returns(types.int)
|
||||||
p The integer ID of the string.
|
p The integer ID of the string.
|
||||||
|
|
||||||
+method("__getitem__", "key_utf8_bytes")
|
+method("__getitem__", "key_utf8_bytes")
|
||||||
+params
|
+params
|
||||||
+param("key_utf8_bytes", bytes_type)
|
+param("key_utf8_bytes", types.bytes)
|
||||||
p A key, as a UTF-8 encoded byte-string
|
p A key, as a UTF-8 encoded byte-string
|
||||||
|
|
||||||
+returns(int_type)
|
+returns(types.int)
|
||||||
p The integer ID of the string.
|
p The integer ID of the string.
|
||||||
|
|
||||||
|
details
|
||||||
|
summary: h4 Import/Export
|
||||||
|
|
||||||
+method("dump", "loc")
|
+method("dump", "loc")
|
||||||
+params
|
+params
|
||||||
+param("loc")
|
+param("loc")
|
||||||
|
|
Loading…
Reference in New Issue
Block a user