2015-08-12 08:24:37 +02:00

564 lines
17 KiB

//- Type labels handed to the mixins below as their `type` argument.
//- unicode_type and bool_type hold anchor markup wrapping the type name;
//- the remaining labels are declared as empty strings.
- var unicode_type = '<a class="reference" href=""><em>unicode</em></a>'
- var bool_type = '<a class="reference" href=""><em>bool</em></a>'
- var int_type = ""
- var Token_type = ""
- var Span_type = ""
- var Vocab_type = ""
- var generator_type = ""
//- declare_class(name): emits a span.label reading "class", then the class
//- name inside a code element.
mixin declare_class(name)
span.label class
code #{name}
//- method(name, parameters): emits a span.label holding the method name,
//- then the text "self, " followed by the `parameters` string.
//- Only two arguments are declared.
mixin method(name, parameters)
span.label #{name}
| self, #{parameters}
//- params: no body is visible for this mixin in this view of the file.
mixin params
//- param(name, type, value): emits the parameter name in bold followed by an
//- en dash (&#8211;); the type-bearing line interpolates `type` unescaped
//- via !{...}, so markup in the type label is rendered as-is.
//- NOTE(review): no `else` branch is visible between the two <strong> lines,
//- and `value` is not referenced in the visible lines - confirm against the
//- full template.
mixin param(name, type, value)
if type
<strong>#{name}</strong> (!{type}) &#8211;
<strong>#{name}</strong> &#8211;
//- attribute(name, type, value): emits a span.label holding the attribute name.
//- `type` and `value` are not referenced in the visible lines.
mixin attribute(name, type, value)
span.label #{name}
//- returns(name, type, value): emits the name in bold followed by an en dash
//- (&#8211;); the type-bearing line interpolates `type` unescaped via !{...}.
//- NOTE(review): no `else` branch is visible between the two <strong> lines,
//- and `value` is not referenced in the visible lines - confirm.
mixin returns(name, type, value)
if type
<strong>#{name}</strong> (!{type}) &#8211;
<strong>#{name}</strong> &#8211;
//- NOTE(review): `returns` is declared a second time here with a different
//- parameter list; presumably the later declaration is the one in effect -
//- confirm, and consider removing or renaming one of them.
mixin returns(type)
| tmp
doctype html
title!= tag_line
meta(name="description" content="")
meta(name="author" content="Matthew Honnibal")
link(rel="stylesheet" href="css/style.css")
<!--[if lt IE 9]>
h1.logo!= tag_line
div.slogan!= slogan
li: a(href="#") Home
li: a(href="#") Docs
li: a(href="#") License
li: a(href="#") Blog
| Tmp
h3: a(href="#") Header
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")
| The data directory. May be <code>None</code>, to disable any data loading
| (including the vocabulary).
| A class/function that creates the tokenizer.
| A class/function that creates the part-of-speech tagger.
| A class/function that creates the dependency parser.
| A class/function that creates the named entity recogniser.
| A boolean value to control whether the word vectors are loaded.
+method("__call__", "text, tag=True, parse=True, entity=True")(open)
+param("text", unicode_type)
| The text to be processed. No pre-processing needs to be applied,
| and any length of text can be submitted. Usually you will submit
| a whole document. Text may be zero-length. An exception is raised
| if byte strings are supplied.
+param("tag", bool_type)
| Whether to apply the part-of-speech tagger. Required for parsing
| and entity recognition.
+param("parse", bool_type)
| Whether to apply the syntactic dependency parser.
+param("entity", bool_type)
| Whether to apply the named entity recognizer.
| from spacy.en import English
| nlp = English()
| doc = nlp(u'Some text.') # Applies tagger, parser, entity
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
| doc = nlp(u'') # Zero-length tokens, not an error
| # doc = nlp(b'Some text') <-- Error: need unicode
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
+method("__init__", "vocab")
+param("vocab", Vocab_type)
| A vocabulary object
+method("__getitem__", "i", int_type)
+method("__getitem__", "start_end", slice_type)
| Iterate over tokens
| Number of tokens in the document.
+attribute("sents", generator_type)
| Iterate over sentences in the document.
+attribute("ents", generator_type)
| Iterate over named entities in the document.
+attribute("noun_chunks", generator_type)
+method("to_array", "attr_ids")
| Given a list of M attribute IDs, export the tokens to a numpy ndarray
| of shape N*M, where N is the number of tokens in the document.
+param("attr_ids", "list[int]")
| A list of attribute ID ints.
| A feature matrix, with one row per word, and one column per attribute
| indicated in the input attr_ids.
+method("count_by", "attr_id")
| Produce a dict of {attribute (int): count (int)} frequencies, keyed
| by the values of the given attribute ID.
| >>> from spacy.en import English, attrs
| >>> nlp = English()
| >>> tokens = nlp(u'apple apple orange banana')
| >>> tokens.count_by(attrs.ORTH)
| {12800L: 1, 11880L: 2, 7561L: 1}
| >>> tokens.to_array([attrs.ORTH])
| array([[11880],
| [11880],
| [7561],
| [12800]])
+method("from_array", "attrs, array")
| Load from array
| Serialize
| Deserialize, loading from bytes
| classmethod
+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
| Merge a multi-word expression into a single token. Currently
| experimental; API is likely to change.
+method("__init__", "vocab, doc, offset")
+param("vocab", Vocab_type)
p A Vocab object
+param("doc", Doc_type)
p The parent sequence
+param("offset", int_type)
p The index of the token within the document
summary: h4 String Views
+attribute("orth / orth_")
| The form of the word with no string normalization or processing, as
| it appears in the string, without trailing whitespace.
+attribute("lemma / lemma_")
| The "base" of the word, with no inflectional suffixes, e.g. the lemma of
| "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
| <em>derivational</em> suffixes are not stripped, e.g. the lemma of
| "institutions" is "institution", not "institute". Lemmatization is
| performed using the WordNet data, but extended to also cover closed-class
| words such as pronouns. By default, the WN lemmatizer returns "hi"
| as the lemma of "his". We assign pronouns the lemma -PRON-.
+attribute("lower / lower_")
| The form of the word, but forced to lower-case, i.e.
pre.language-python: code lower = word.orth_.lower()
//+attribute("norm / norm_")
// | The form of the word, after language-specific normalizations have been
// | applied.
+attribute("shape / shape_")
| A transform of the word's string, to show orthographic features.
| The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
| to d. After these mappings, sequences of 4 or more of the same character
| are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
| :) --> :)
+attribute("prefix / prefix_")
| A length-N substring from the start of the word. Length may vary by
| language; currently for English n=1, i.e.
pre.language-python: code prefix = word.orth_[:1]
+attribute("suffix / suffix_")
| A length-N substring from the end of the word. Length may vary by
| language; currently for English n=3, i.e.
pre.language-python: code suffix = word.orth_[-3:]
// | lex_id
summary: h4 Alignment and Output
p Start index of the token in the string
+method("__len__", "")
p Length of the token's orth string, in unicode code-points.
+method("__unicode__", "")
p Same as token.orth_
+method("__str__", "")
p Varies between Python 2 and Python 3
| The form of the word as it appears in the string, <strong>including
| trailing whitespace</strong>. This is useful when you need to use
| linguistic features to add inline mark-up to the string.
+method("nbor", "i=1")
p Offset relative to token
summary: h4 Distributional Features
| A "word embedding" representation: a dense real-valued vector that supports
| similarity queries between words. By default, spaCy currently loads
| vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
| model.
| The Brown cluster ID of the word. These are often useful features for
| linear models. If you're using a non-linear model, particularly a
| neural net or random forest, consider using the real-valued word
| representation vector, in Token.repvec, instead.
| The unigram log-probability of the word, estimated from counts from a
| large corpus, smoothed using Simple Good Turing estimation.
summary: h4 Syntactic Tags
+attribute("pos / pos_")
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
| the 17 tag values are provided in <code>spacy.parts_of_speech</code>.
+attribute("tag / tag_")
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
| <code>DT</code>, etc. These tags are language/corpus specific, and
| typically describe part-of-speech and some amount of morphological
| information. For instance, in the Penn Treebank tag set, <code>VBZ</code>
| is assigned to a present-tense singular verb.
+attribute("dep / dep_")
| The type of syntactic dependency relation between the word and its
| syntactic head.
summary: h4 Navigating the Parse Tree
| The Token that is the immediate syntactic head of the word. If the
| word is the root of the dependency tree, the same word is returned.
| An iterator for the immediate leftward syntactic children of the
| word.
| An iterator for the immediate rightward syntactic children of the
| word.
| The number of immediate syntactic children preceding the word in
| the string.
| The number of immediate syntactic children following the word in
| the string.
| An iterator that yields from lefts, and then yields from rights.
| An iterator for the part of the sentence syntactically governed by
| the word, including the word itself.
p The leftmost edge of the token's subtree
p The rightmost edge of the token's subtree
summary: h4 Named Entities
p If the token is part of an entity, its entity type.
p The IOB (inside, outside, begin) entity recognition tag for the token.
summary: h4 Lexeme Flags
+method("check_flag", "flag_id")
| flag ID
// | Conjuncts
p Get item
p Iter
p Len
p Syntactic head
p Tokens that are:
li To the left of the span;
li Syntactic children of words within the span
p i.e.
| lefts = [span.doc[i] for i in range(0, span.start)
| if span.doc[i].head in span]
p Tokens that are:
li To the right of the span;
li Syntactic children of words within the span
p i.e.
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
| if span.doc[i].head in span]
p String
+attribute("lemma / lemma_")
p String
+attribute("label / label_")
p String
p String
+declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None")
p Number of words in the vocabulary.
+method("__getitem__", "key_int")
p Integer ID
+returns: p A Lexeme object
+method("__getitem__", "key_str")
+param("key_str", unicode_type)
p A string in the vocabulary
+method("__setitem__", "orth_str, props")
+param("orth_str", unicode_type)
p The orth key
+param("props", dict_type)
p A props dictionary
+method("dump", "loc")
+param("loc", unicode_type)
p Path where the vocabulary should be saved
+method("load_lexemes", "loc")
+param("loc", unicode_type)
p Path to load the lexemes.bin file from
+method("load_vectors", "loc")
+param("loc", unicode_type)
p Path to load the vectors.bin from
p Number of strings in the string-store
+method("__getitem__", "key_int")
p An integer key
p The string that the integer key maps to
+method("__getitem__", "key_unicode")
p A key, as a unicode string
p The integer ID of the string.
+method("__getitem__", "key_utf8_bytes")
+param("key_utf8_bytes", bytes_type)
p A key, as a UTF-8 encoded byte-string
p The integer ID of the string.
+method("dump", "loc")
p File path to save the strings.txt to.
p File path to load the strings.txt from.