mirror of https://github.com/explosion/spaCy.git (synced 2025-02-24 07:30:52 +03:00)

Merge branch 'gaz' of https://github.com/honnibal/spaCy into gaz

This commit is contained in:
commit 4f765eee79

docs/redesign/docs.jade (new file, 705 lines)

@@ -0,0 +1,705 @@
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'

-
  var types = {
    'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
    'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
    'int': py_docs + 'functions.html#int"><em>int</em></a>',
    'generator': "",
    'Vocab': "",
    'Span': "",
    'Doc': "",
    // Referenced by declarations further down
    'Token': "",
    'slice': "",
    'dict': "",
    'bytes': ""
  }


mixin declare_class(name)
  details
    summary
      span.declaration
        span.label class
        code #{name}
    block

mixin method(name, parameters)
  details(open=attributes.open)
    summary
      span.declaration
        span.label #{name}
        span.parameters
          | self, #{parameters}
    block

mixin params
  ul
    block

mixin param(name, type, value)
  li
    if type
      <strong>#{name}</strong> (!{type}) –
    else
      <strong>#{name}</strong> –
    block

mixin attribute(name, type, value)
  details(open=attributes.open)
    summary
      span.declaration
        span.label #{name}
    block

mixin returns(name, type, value)
  li
    if type
      <strong>#{name}</strong> (!{type}) –
    else
      <strong>#{name}</strong> –
    block

mixin init
  details
    summary: h4 Init
    block

mixin callable
  details
    summary: h4 Callable
    block

mixin sequence
  details
    summary: h4 Sequence
    block

mixin maptype
  details
    summary: h4 Map
    block

mixin summary
  block

mixin en_example
  pre.language-python
    code
      | from spacy.en import English
      | from spacy._doc_examples import download_war_and_peace
      |
      | unprocessed_unicode = download_war_and_peace()
      |
      | nlp = English()
      | doc = nlp(unprocessed_unicode)


doctype html
html(lang="en")
  head
    meta(charset="utf-8")
    title spaCy – Industrial-strength NLP
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
    <!--[if lt IE 9]>
    script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
    <![endif]-->

  body(id="docs")
    header(role="banner")
      h1.logo spaCy – Industrial-strength NLP
      div.slogan API

    nav(role="navigation")
      ul
        li: a(href="#") Home
        li.active: a(href="#") Docs
        li: a(href="#") License
        li: a(href="#") Blog

    main.docs#content
      article
        +declare_class("English")
          p Load models into a callable object to process English text.

          +summary
            +en_example

          +init
            p
              | Load the resources. Loading takes 20 seconds, and the instance
              | consumes 2 to 3 gigabytes of memory.

            p
              | Intended use is for one instance to be created per process.
              | You can create more if you're doing something unusual.

            p
              | You may wish to make the instance a global variable or "singleton".
              | We usually instantiate the object in the <code>main()</code>
              | function and pass it around as an explicit argument.
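
            p
              | For example, a minimal sketch of that pattern (the
              | <code>count_nouns()</code> helper here is hypothetical):

            pre.language-python
              code
                | from spacy.en import English
                |
                | def count_nouns(nlp, text):
                |     # Process one text with the shared instance
                |     doc = nlp(text)
                |     return sum(1 for token in doc if token.pos_ == 'NOUN')
                |
                | def main():
                |     nlp = English()  # Load the models once per process
                |     print(count_nouns(nlp, u'Dogs chase cats.'))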

            +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")

              +params
                +param("data_dir")
                  | The data directory. May be <code>None</code>, to disable any data loading
                  | (including the vocabulary).

                +param("Tokenizer")
                  | A class/function that creates the tokenizer.

                +param("Tagger")
                  | A class/function that creates the part-of-speech tagger.

                +param("Parser")
                  | A class/function that creates the dependency parser.

                +param("Entity")
                  | A class/function that creates the named entity recogniser.

                +param("load_vectors")
                  | A boolean value to control whether the word vectors are loaded.

          +callable
            +method("__call__", "text, tag=True, parse=True, entity=True")

              +params
                +param("text", types.unicode)
                  | The text to be processed. No pre-processing needs to be applied,
                  | and any length of text can be submitted. Usually you will submit
                  | a whole document. Text may be zero-length. An exception is raised
                  | if byte strings are supplied.

                +param("tag", types.bool)
                  | Whether to apply the part-of-speech tagger. Required for parsing
                  | and entity recognition.

                +param("parse", types.bool)
                  | Whether to apply the syntactic dependency parser.

                +param("entity", types.bool)
                  | Whether to apply the named entity recognizer.

              pre.language-python
                code
                  | from spacy.en import English
                  | nlp = English()
                  | doc = nlp(u'Some text.') # Applies tagger, parser, entity
                  | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
                  | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
                  | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
                  | doc = nlp(u'') # Zero-length tokens, not an error
                  | # doc = nlp(b'Some text') <-- Error: need unicode
                  | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.


        +declare_class("Doc")
          p A sequence of annotated tokens, representing one processed document.

          +init
            +method("__init__", "vocab")
              +params
                +param("vocab", types.Vocab)
                  | A vocabulary object

          +sequence
            +method("__getitem__", "i", types.int)
              +returns(types.Token)

            +method("__getitem__", "start_end", types.slice)
              +returns(types.Span)

            +method("__iter__")
              | Iterate over tokens

            +method("__len__")
              | Number of tokens in the document.
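
            p
              | A short sketch of the sequence API:

            pre.language-python
              code
                | doc = nlp(u'Some text.')
                | token = doc[0]       # __getitem__ with an int --> Token
                | span = doc[0:2]      # __getitem__ with a slice --> Span
                | n_tokens = len(doc)  # __len__
                | words = [t.orth_ for t in doc]  # __iter__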

          details
            summary: h4 Spans

            +attribute("sents", types.generator)
              | Iterate over sentences in the document.

            +attribute("ents", types.generator)
              | Iterate over named entities in the document.

            +attribute("noun_chunks", types.generator)
              | Iterate over base noun phrases in the document.

          details
            summary: h4 Export/Import

            +method("to_array", "attr_ids")
              | Given a list of M attribute IDs, export the tokens to a numpy ndarray
              | of shape N*M, where N is the length of the document.

              +params
                +param("attr_ids", "list[int]")
                  | A list of attribute ID ints.

              +returns("feat_array")
                | A feature matrix, with one row per word, and one column per attribute
                | indicated in the input attr_ids.

            +method("count_by", "attr_id")
              | Produce a dict of {attribute value (int): count (int)} frequencies, keyed
              | by the values of the given attribute ID.

              pre.language-python
                code
                  | >>> from spacy.en import English, attrs
                  | >>> nlp = English()
                  | >>> tokens = nlp(u'apple apple orange banana')
                  | >>> tokens.count_by(attrs.ORTH)
                  | {12800L: 1, 11880L: 2, 7561L: 1}
                  | >>> tokens.to_array([attrs.ORTH])
                  | array([[11880],
                  |        [11880],
                  |        [7561],
                  |        [12800]])

            +method("from_array", "attrs, array")
              | Load from array

            +method("from_bytes")
              | Deserialize, loading from bytes

            +method("read_bytes")
              | classmethod
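
            p
              | A sketch of the intended round-trip, assuming a
              | <code>to_bytes()</code> counterpart to <code>from_bytes()</code>
              | (the path <code>loc</code> is hypothetical):

            pre.language-python
              code
                | with open(loc, 'wb') as file_:
                |     file_.write(doc.to_bytes())
                |
                | with open(loc, 'rb') as file_:
                |     for byte_string in Doc.read_bytes(file_):
                |         doc = Doc(nlp.vocab)
                |         doc.from_bytes(byte_string)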

            //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")

            //  | Merge a multi-word expression into a single token. Currently
            //  | experimental; API is likely to change.


        +declare_class("Token")
          +init
            +method("__init__", "vocab, doc, offset")
              +params
                +param("vocab", types.Vocab)
                  p A Vocab object

                +param("doc", types.Doc)
                  p The parent sequence

                +param("offset", types.int)
                  p The index of the token within the document

          details
            summary: h4 String Views

            +attribute("orth / orth_")
              | The form of the word with no string normalization or processing, as
              | it appears in the string, without trailing whitespace.

            +attribute("lemma / lemma_")
              | The "base" of the word, with no inflectional suffixes, e.g. the lemma of
              | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
              | <em>derivational</em> suffixes are not stripped, e.g. the lemma of
              | "institutions" is "institution", not "institute". Lemmatization is
              | performed using the WordNet data, but extended to also cover closed-class
              | words such as pronouns. By default, the WN lemmatizer returns "hi"
              | as the lemma of "his". We assign pronouns the lemma -PRON-.

            +attribute("lower / lower_")
              | The form of the word, but forced to lower-case, i.e.
              pre.language-python: code lower = word.orth\_.lower()

            //+attribute("norm / norm_")
            //  | The form of the word, after language-specific normalizations have been
            //  | applied.

            +attribute("shape / shape_")
              | A transform of the word's string, to show orthographic features.
              | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
              | to d. After these mappings, sequences of 4 or more of the same character
              | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
              | :) --> :)
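
            p
              | For instance, a sketch of the expected values under the mapping
              | above:

            pre.language-python
              code
                | doc = nlp(u'C3Po is their favorite')
                | assert doc[0].shape_ == 'XdXx'
                | assert doc[3].shape_ == 'xxxx'  # 8 x's, truncated to 4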

            +attribute("prefix / prefix_")
              | A length-N substring from the start of the word. Length may vary by
              | language; currently for English n=1, i.e.
              pre.language-python: code prefix = word.orth\_[:1]

            +attribute("suffix / suffix_")
              | A length-N substring from the end of the word. Length may vary by
              | language; currently for English n=3, i.e.
              pre.language-python: code suffix = word.orth\_[-3:]

            //+attribute("lex_id")
            //  | lex_id

          details
            summary: h4 Alignment and Output

            +attribute("idx")
              p Start index of the token in the string

            +method("__len__", "")
              p Length of the token's orth string, in unicode code-points.

            +method("__unicode__", "")
              p Same as token.orth_

            +method("__str__", "")
              p Varies between Python 2 and Python 3

            +attribute("string")
              p
                | The form of the word as it appears in the string, <strong>including
                | trailing whitespace</strong>. This is useful when you need to use
                | linguistic features to add inline mark-up to the string.
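
              p
                | For example, because <code>string</code> includes trailing
                | whitespace, concatenating the views reproduces the original
                | text exactly (a sketch):

              pre.language-python
                code
                  | text = u'I visited Paris yesterday.'
                  | doc = nlp(text)
                  | assert ''.join(t.string for t in doc) == text
                  | # Upper-case entity tokens, preserving spacing:
                  | marked = ''.join(
                  |     t.string.upper() if t.ent_type else t.string for t in doc)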

            +method("nbor", "i=1")
              +params
                +param("i")
                  p Offset relative to token

          details
            summary: h4 Distributional Features

            +attribute("repvec")
              p
                | A "word embedding" representation: a dense real-valued vector that supports
                | similarity queries between words. By default, spaCy currently loads
                | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
                | model.
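
              p
                | A sketch of a raw similarity query over <code>repvec</code>
                | (cosine via numpy; assumes the vectors were loaded):

              pre.language-python
                code
                  | import numpy
                  |
                  | apples, oranges = nlp(u'apples oranges')
                  | cosine = numpy.dot(apples.repvec, oranges.repvec) / (
                  |     numpy.linalg.norm(apples.repvec) * numpy.linalg.norm(oranges.repvec))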

            +attribute("cluster")
              p
                | The Brown cluster ID of the word. These are often useful features for
                | linear models. If you're using a non-linear model, particularly a
                | neural net or random forest, consider using the real-valued word
                | representation vector, in Token.repvec, instead.

            +attribute("prob")
              p
                | The unigram log-probability of the word, estimated from counts from a
                | large corpus, smoothed using Simple Good Turing estimation.
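
              p
                | Since these are log-probabilities, frequent words have higher
                | (less negative) values (a sketch):

              pre.language-python
                code
                  | the, quixotic = nlp(u'the quixotic')
                  | assert the.prob > quixotic.prob  # 'the' is far more frequent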

          details
            summary: h4 Syntactic Tags

            +attribute("pos / pos_")
              p
                | A part-of-speech tag, from the Google Universal Tag Set, e.g.
                | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
                | the 17 tag values are provided in <code>spacy.parts_of_speech</code>.

            +attribute("tag / tag_")
              p
                | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
                | <code>DT</code>, etc. These tags are language/corpus specific, and
                | typically describe part-of-speech and some amount of morphological
                | information. For instance, in the Penn Treebank tag set, <code>VBZ</code>
                | is assigned to a present-tense singular verb.

            +attribute("dep / dep_")
              p
                | The type of syntactic dependency relation between the word and its
                | syntactic head.

          details
            summary: h4 Navigating the Parse Tree

            +attribute("head")
              p
                | The Token that is the immediate syntactic head of the word. If the
                | word is the root of the dependency tree, the same word is returned.

            +attribute("lefts")
              p
                | An iterator for the immediate leftward syntactic children of the
                | word.

            +attribute("rights")
              p
                | An iterator for the immediate rightward syntactic children of the
                | word.

            +attribute("n_lefts")
              p
                | The number of immediate syntactic children preceding the word in
                | the string.

            +attribute("n_rights")
              p
                | The number of immediate syntactic children following the word in
                | the string.

            +attribute("children")
              p
                | An iterator that yields from lefts, and then yields from rights.

            +attribute("subtree")
              p
                | An iterator for the part of the sentence syntactically governed by
                | the word, including the word itself.

            +attribute("left_edge")
              p The leftmost edge of the token's subtree

            +attribute("right_edge")
              p The rightmost edge of the token's subtree
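
            p
              | A sketch of walking the tree with these attributes, assuming
              | the expected parse:

            pre.language-python
              code
                | doc = nlp(u'The four wheels on the bus turned quickly.')
                | wheels = doc[2]
                | assert wheels.head.orth_ == 'turned'
                | assert [t.orth_ for t in wheels.lefts] == ['The', 'four']
                | assert [t.orth_ for t in wheels.rights] == ['on']
                | assert [t.orth_ for t in wheels.subtree] == \
                |     ['The', 'four', 'wheels', 'on', 'the', 'bus']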

          details
            summary: h4 Named Entities

            +attribute("ent_type")
              p If the token is part of an entity, its entity type.

            +attribute("ent_iob")
              p The IOB (inside, outside, begin) entity recognition tag for the token.

          details
            summary: h4 Lexeme Flags

            +method("check_flag", "flag_id")
              +params
                +param("flag_id")
                  | flag ID

            +attribute("is_oov")
            +attribute("is_alpha")
            +attribute("is_ascii")
            +attribute("is_digit")
            +attribute("is_lower")
            +attribute("is_title")
            +attribute("is_punct")
            +attribute("is_space")
            +attribute("like_url")
            +attribute("like_num")
            +attribute("like_email")

            //+attribute("conjuncts")
            //  | Conjuncts


        +declare_class("Span")
          +init
            +method("__init__")
              | Temp

          p
            <code>span = doc[0:4]</code>

          +sequence
            +method("__getitem__")
              p Get item

            +method("__iter__")
              p Iter

            +method("__len__")
              p Len

          details
            summary: h4 Parse

            +attribute("root")
              p Syntactic head

            +attribute("lefts")
              p Tokens that are:
              ol
                li To the left of the span;
                li Syntactic children of words within the span

              p i.e.

              pre.language-python
                code
                  | lefts = [span.doc[i] for i in range(0, span.start)
                  |          if span.doc[i].head in span]

            +attribute("rights")
              p Tokens that are:
              ol
                li To the right of the span;
                li Syntactic children of words within the span

              p i.e.

              pre.language-python
                code
                  | rights = [span.doc[i] for i in range(span.end, len(span.doc))
                  |           if span.doc[i].head in span]

            +attribute("subtree")
              p String

          details
            summary: h4 String Views

            +attribute("string")
              p String

            +attribute("lemma / lemma_")
              p String

            +attribute("label / label_")
              p String


        +declare_class("Lexeme")
          p
            | The Lexeme object represents a lexical type, stored in the vocabulary
            | – as opposed to a token, occurring in a document.

          p
            | Lexemes store various features, so that these features can be computed
            | once per type, rather than once per token. As job sizes grow, this
            | can amount to a substantial efficiency improvement.

          p
            | All Lexeme attributes are therefore context independent, as a single
            | lexeme is reused for all usages of that word. Lexemes are keyed by
            | the “orth” attribute.

          p
            | All Lexeme attributes are accessible directly on the Token object.

          +init
            +method("__init__")
              p Init

          details
            summary: h4 String Features

            +attribute("orth / orth_")
              p
                | The form of the word with no string normalization or processing,
                | as it appears in the string, without trailing whitespace.

            +attribute("lower / lower_")
              p Tmp

            +attribute("norm / norm_")
              p Tmp

            +attribute("shape / shape_")
              p Tmp

            +attribute("prefix / prefix_")
              p Tmp

            +attribute("suffix / suffix_")
              p Tmp


        +declare_class("Vocab", "data_dir=None, lex_props_getter=None")
          +sequence
            +method("__len__")
              +returns
                p Number of words in the vocabulary.

            +method("__iter__")
              +returns
                p Lexeme

          +maptype
            +method("__getitem__", "key_int")
              +params
                +param("key")
                  p Integer ID

              +returns: p A Lexeme object

            +method("__getitem__", "key_str")
              +params
                +param("key_str", types.unicode)
                  p A string in the vocabulary

              +returns("Lexeme")

            +method("__setitem__", "orth_str, props")
              +params
                +param("orth_str", types.unicode)
                  p The orth key

                +param("props", types.dict)
                  p A props dictionary

              +returns("None")

          details
            summary: h4 Import/Export

            +method("dump", "loc")
              +params
                +param("loc", types.unicode)
                  p Path where the vocabulary should be saved

            +method("load_lexemes", "loc")
              +params
                +param("loc", types.unicode)
                  p Path to load the lexemes.bin file from

            +method("load_vectors", "loc")
              +params
                +param("loc", types.unicode)
                  p Path to load the vectors.bin from


        +declare_class("StringStore")
          +init
            | Tmp

          +sequence
            +method("__len__")
              +returns("int")
                p Number of strings in the string-store

            +method("__iter__")
              +returns
                p A unicode string in the store

          +maptype
            +method("__getitem__", "key_int")
              +params
                +param("key_int")
                  p An integer key

              +returns(types.unicode)
                p The string that the integer key maps to

            +method("__getitem__", "key_unicode")
              +params
                +param("key_unicode")
                  p A key, as a unicode string

              +returns(types.int)
                p The integer ID of the string.

            +method("__getitem__", "key_utf8_bytes")
              +params
                +param("key_utf8_bytes", types.bytes)
                  p A key, as a UTF-8 encoded byte-string

              +returns(types.int)
                p The integer ID of the string.
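
            p
              | A sketch of the round-trip between strings and integer IDs:

            pre.language-python
              code
                | string_id = nlp.vocab.strings[u'Hello']
                | assert nlp.vocab.strings[string_id] == u'Hello'
                | assert nlp.vocab.strings[b'Hello'] == string_id  # UTF-8 bytes key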

          details
            summary: h4 Import/Export

            +method("dump", "loc")
              +params
                +param("loc")
                  p File path to save the strings.txt to.

            +method("load", "loc")
              +params
                +param("loc")
                  p File path to load the strings.txt from.

    script(src="js/prism.js")

docs/redesign/home.jade (new file, 106 lines)

@@ -0,0 +1,106 @@
extends ./outline.jade

// Notes
//
// 1. Where to put version notice? Should say something like
//    2015-08-12: v0.89
//    and be a link
//
// Only needs to appear on home page.


- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy – " + slogan

mixin lede
  - var state_of_the_art = '<a href="#">state-of-the-art</a>'
  - var a_minor_miracle = '<a href="">a minor miracle</a>'
  - var great_documentation = '<a href="">great documentation</a>'

  p.
    <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
    library for industrial-strength NLP in Python and Cython. It features
    !{state_of_the_art} speed and accuracy, a concise API, and !{great_documentation}.
    If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
    like !{a_minor_miracle}.

mixin overview()
  p.
    Overview text

mixin benchmarks()
  p.
    Benchmarks

mixin get_started()
  p.
    Get Started

mixin comparison(name)
  details
    summary
      h4= name

    block

mixin columns(...names)
  tr
    each name in names
      th= name

mixin row(...cells)
  tr
    each cell in cells
      td= cell

mixin social
  footer(role="contentinfo")
    a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter

    div.discuss
      a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
        | Discuss on Hacker News

      a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
        | Discuss on Reddit

mixin Section(title_text, link_name, include_file)
  a(name=link_name): h3 #{title_text}

  if (link_name == "example-use")
    include ./usage_examples.jade
  else if (link_name == "online-demo")
    include ./online_demo.jade
  else if (link_name == "comparisons")
    include ./comparisons.jade
  else if (link_name == "install")
    include ./installation.jade


block intro_block
  section(class="intro")
    +lede

    nav(role="navigation")
      ul
        li: a(href="#example-use" class="button") Examples
        li: a(href="#online-demo" class="button") Demo
        li: a(href="#comparisons" class="button") Comparisons
        li: a(href="#install" class="button") Install v0.89


block body_block
  article(class="page landing-page")

    +Section("Usage by Example", "example-use", "./usage_examples.jade")

    +Section("Online Demo", "online-demo", "./online_demo.jade")

    +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")

    +Section("Install", "install", "./installation.jade")

docs/redesign/installation.jade (new file, 40 lines)

@@ -0,0 +1,40 @@
p With Python 2.7 or Python 3, using Linux or OSX, run:

pre.language-bash: code
  | $ pip install spacy
  | $ python -m spacy.en.download

p
  | The download command fetches and installs about 300 MB of data, for
  | the parser model and word vectors, which it installs within the spacy.en
  | package directory.

p
  | If you're stuck using a server with an old version of Python, and you
  | don't have root access, I've prepared a bootstrap script to help you
  | compile a local Python install. Run:

pre.language-bash: code
  | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate

p
  | The other way to install the package is to clone the github repository,
  | and build it from source. This installs an additional dependency,
  | Cython. If you're using Python 2, I also recommend installing fabric
  | and fabtools – this is how I build the project.

pre.language-bash: code
  | $ git clone https://github.com/honnibal/spaCy.git
  | $ cd spaCy
  | $ virtualenv .env && source .env/bin/activate
  | $ export PYTHONPATH=`pwd`
  | $ pip install -r requirements.txt
  | $ python setup.py build_ext --inplace
  | $ python -m spacy.en.download
  | $ pip install pytest
  | $ py.test tests/

p
  | Python packaging is awkward at the best of times, and it's particularly tricky
  | with C extensions, built via Cython, requiring large data files. So,
  | please report issues as you encounter them.

docs/redesign/online_demo.jade (new file, 0 lines)

docs/redesign/outline.jade (new file, 37 lines)

@@ -0,0 +1,37 @@
- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy – " + slogan

doctype html
html(lang="en")
  head
    meta(charset="utf-8")
    title!= tag_line
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
    <!--[if lt IE 9]>
    script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
    <![endif]-->

  body(id="home" role="document")
    header(role="banner")
      h1(class="logo")!= tag_line
      div(class="slogan")!= slogan

    nav(role="navigation")
      ul
        li: a(href="#") Home
        li: a(href="#") Docs
        li: a(href="#") License
        li: a(href="#") More

    main(id="content" role="main")
      block intro_block

      block body_block

    footer(role="contentinfo")

    script(src="js/prism.js")
    script(src="js/details_polyfill.js")

docs/redesign/usage_examples.jade (new file, 109 lines)

@@ -0,0 +1,109 @@
mixin example(name)
  details
    summary
      h4= name

    block


+example("Load resources and process text")
  pre.language-python: code
    | from __future__ import unicode_literals, print_function
    | from spacy.en import English
    | nlp = English()
    | doc = nlp('Hello, world. Here are two sentences.')

+example("Get tokens and sentences")
  pre.language-python: code
    | token = doc[0]
    | sentence = next(doc.sents)
    | assert token is sentence[0]

+example("Use integer IDs for any string")
  pre.language-python: code
    | hello_id = nlp.vocab.strings['Hello']
    | hello_str = nlp.vocab.strings[hello_id]
    |
    | assert token.orth == hello_id == 52
    | assert token.orth_ == hello_str == 'Hello'

+example("Get and set string views and flags")
  pre.language-python: code
    | assert token.shape_ == 'Xxxxx'
    | for lexeme in nlp.vocab:
    |     if lexeme.is_alpha:
    |         lexeme.shape_ = 'W'
    |     elif lexeme.is_digit:
    |         lexeme.shape_ = 'D'
    |     elif lexeme.is_punct:
    |         lexeme.shape_ = 'P'
    |     else:
    |         lexeme.shape_ = 'M'
    | assert token.shape_ == 'W'

+example("Export to numpy arrays")
  pre.language-python: code
    | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
    |
    | attr_ids = [ORTH, LIKE_URL, IS_OOV]
    | doc_array = doc.to_array(attr_ids)
    | assert doc_array.shape == (len(doc), len(attr_ids))
    | assert doc[0].orth == doc_array[0, 0]
    | assert doc[1].orth == doc_array[1, 0]
    | assert doc[0].like_url == doc_array[0, 1]
    | assert list(doc_array[:, 1]) == [t.like_url for t in doc]

+example("Word vectors")
  pre.language-python: code
    | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
    |
    | apples = doc[0]
    | oranges = doc[2]
    | boots = doc[6]
    | hippos = doc[8]
    |
    | assert apples.similarity(oranges) > boots.similarity(hippos)

+example("Part-of-speech tags")
  pre.language-python: code
    | doc[0].pos
    | doc[0].tag

+example("Syntactic dependencies")
  pre.language-python: code
    | sent = nlp('The four wheels on the bus turned quickly.')
    | for head in sent:
    |     for child in head.lefts:
    |         assert child.head is head
    |     for child in head.rights:
    |         assert child.head is head
    |
    | wheels = sent[2]
    | bus = sent[5]
    | assert len(list(wheels.lefts)) == 2
    | assert len(list(wheels.rights)) == 1
    | assert len(list(wheels.children)) == 3
    | assert len(list(bus.lefts)) == 1
    | assert len(list(bus.rights)) == 0
    | assert len(list(bus.children)) == 1
    |
    | assert len(list(wheels.subtree)) == 6

+example("Named entities")
  pre.language-python: code
    | doc.ents
    | token.ent_type
    | token.ent_iob

+example("Define custom NER rules")
  pre.language-python: code
    | nlp.matcher

+example("Calculate inline mark-up on original string")
  pre.language-python: code
    | token.string
    | token.spacy
    | token.whitespace_

+example("Efficient binary serialization")
  pre.language-python: code
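    | # A sketch, assuming the to_bytes/read_bytes/from_bytes methods
    | # documented for Doc; the import path and file name are assumptions.
    | from spacy.tokens.doc import Doc
    |
    | byte_string = doc.to_bytes()
    | open('moby_dick.bin', 'wb').write(byte_string)
    |
    | with open('moby_dick.bin', 'rb') as file_:
    |     for byte_string in Doc.read_bytes(file_):
    |         doc = Doc(nlp.vocab)
    |         doc.from_bytes(byte_string)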

@@ -14,8 +14,8 @@
             {"orth": "9/11"}
         ],
         [
-            {"lower": "Septmber"},
-            {"lower": "Eleven"}
+            {"lower": "septmber"},
+            {"lower": "eleven"}
         ],
         [
             {"lower": "september"},