* Work on website
This commit is contained in:
parent
5ee645d742
commit
c9b19a9c00
661
docs/redesign/api.jade
Normal file
|
@ -0,0 +1,661 @@
|
|||
mixin declare_class(name)
|
||||
details
|
||||
summary
|
||||
span.declaration
|
||||
span.label class
|
||||
code #{name}
|
||||
block
|
||||
|
||||
mixin method(name, parameters)
|
||||
details(open=attributes.open)
|
||||
summary
|
||||
span.declaration
|
||||
span.label #{name}
|
||||
span.parameters
|
||||
| self, #{parameters}
|
||||
block
|
||||
|
||||
|
||||
mixin params
|
||||
ul
|
||||
block
|
||||
|
||||
|
||||
mixin param(name, type, value)
|
||||
li
|
||||
if type
|
||||
<strong>#{name}</strong> (!{type}) –
|
||||
else
|
||||
<strong>#{name}</strong> –
|
||||
block
|
||||
|
||||
|
||||
mixin attribute(name, type, value)
|
||||
details(open=attributes.open)
|
||||
summary
|
||||
span.declaration
|
||||
span.label #{name}
|
||||
block
|
||||
|
||||
|
||||
mixin returns(name, type, value)
|
||||
li
|
||||
if type
|
||||
<strong>#{name}</strong> (!{type}) –
|
||||
else
|
||||
<strong>#{name}</strong> –
|
||||
block
|
||||
|
||||
|
||||
mixin returns(type)
|
||||
| tmp
|
||||
|
||||
mixin init
|
||||
details
|
||||
summary: h4 Init
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin callable
|
||||
details
|
||||
summary: h4 Callable
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin sequence
|
||||
details
|
||||
summary: h4 Sequence
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin maptype
|
||||
details
|
||||
summary: h4 Map
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin summary
|
||||
block
|
||||
|
||||
mixin en_example
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| from spacy._doc_examples import download_war_and_peace
|
||||
|
|
||||
| unprocessed_unicode = download_war_and_peace()
|
||||
|
|
||||
| nlp = English()
|
||||
| doc = nlp(unprocessed_unicode)
|
||||
|
||||
|
||||
+declare_class("English")
|
||||
p Load models into a callable object to process English text.
|
||||
|
||||
+summary
|
||||
+en_example
|
||||
|
||||
+init
|
||||
p
|
||||
| Load the resources. Loading takes 20 seconds, and the instance
|
||||
| consumes 2 to 3 gigabytes of memory.
|
||||
|
||||
p
|
||||
| Intended use is for one instance to be created per process.
|
||||
| You can create more if you're doing something unusual.
|
||||
p
|
||||
| You may wish to make the instance a global variable or "singleton".
|
||||
| We usually instantiate the object in the <code>main()</code>
|
||||
| function and pass it around as an explicit argument.
|
||||
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
|
||||
|
||||
+params
|
||||
+param("data_dir")
|
||||
| The data directory. May be #{None}, to disable any data loading
|
||||
| (including the vocabulary).
|
||||
|
||||
+param("Tokenizer")
|
||||
| A class/function that creates the tokenizer.
|
||||
|
||||
+param("Tagger")
|
||||
| A class/function that creates the part-of-speech tagger.
|
||||
|
||||
+param("Parser")
|
||||
| A class/function that creates the dependency parser.
|
||||
|
||||
+param("Entity")
|
||||
| A class/function that creates the named entity recogniser.
|
||||
|
||||
+param("load_vectors")
|
||||
| A boolean value to control whether the word vectors are loaded.
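p
| For illustration, a minimal sketch of non-default construction based on
| the parameters above (an assumption, not an official recipe):
pre.language-python
code
| from spacy.en import English
| nlp = English(load_vectors=False)   # Skip loading the word vectors
| bare = English(data_dir=None)       # Disable all data loading, including the vocabulary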
|
||||
|
||||
+callable
|
||||
+method("__call__", "text, tag=True, parse=True, entity=True")
|
||||
|
||||
+params
|
||||
+param("text", types.unicode)
|
||||
| The text to be processed. No pre-processing needs to be applied,
|
||||
| and any length of text can be submitted. Usually you will submit
|
||||
| a whole document. Text may be zero-length. An exception is raised
|
||||
| if byte strings are supplied.
|
||||
|
||||
+param("tag", types.bool)
|
||||
| Whether to apply the part-of-speech tagger. Required for parsing
|
||||
| and entity recognition.
|
||||
|
||||
+param("parse", types.bool)
|
||||
| Whether to apply the syntactic dependency parser.
|
||||
|
||||
+param("entity", types.bool)
|
||||
| Whether to apply the named entity recognizer.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English()
|
||||
| doc = nlp(u'Some text.') # Applies tagger, parser, entity
|
||||
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
| doc = nlp(u'') # Zero-length tokens, not an error
|
||||
| # doc = nlp(b'Some text') <-- Error: need unicode
|
||||
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
||||
|
||||
|
||||
+declare_class("Doc")
|
||||
p I'm a doc
|
||||
|
||||
+init
|
||||
+method("__init__", "vocab")
|
||||
+params
|
||||
+param("vocab", vocab_type)
|
||||
| A vocabulary object
|
||||
|
||||
+sequence
|
||||
+method("__getitem__", "i", types.int)
|
||||
+returns(types.Token)
|
||||
|
||||
+method("__getitem__", "start_end", types.slice)
|
||||
+returns(types.Span)
|
||||
|
||||
+method("__iter__")
|
||||
| Iterate over tokens
|
||||
|
||||
+method("__len__")
|
||||
| Number of tokens in the document.
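p
| For illustration, a short sketch of the sequence API above (assuming
| nlp is a loaded English instance):
pre.language-python
code
| doc = nlp(u'Some text to process.')
| token = doc[0]                  # Token, by integer index
| span = doc[1:3]                 # Span, by slice
| n_tokens = len(doc)             # Number of tokens
| words = [t.orth_ for t in doc]  # Iterate over tokens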
|
||||
|
||||
details
|
||||
summary: h4 Spans
|
||||
|
||||
+attribute("sents", types.generator)
|
||||
| Iterate over sentences in the document.
|
||||
|
||||
+attribute("ents", types.generator)
|
||||
| Iterate over named entities in the document.
|
||||
|
||||
+attribute("noun_chunks", types.generator)
|
||||
|
||||
details
|
||||
summary: h4 Export/Import
|
||||
|
||||
+method("to_array", "attr_ids")
|
||||
|
||||
| Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||
| of shape N*M, where N is the length of the sentence.
|
||||
|
||||
+params
|
||||
+param("attr_ids", "list[int]")
|
||||
| A list of attribute ID ints.
|
||||
|
||||
+returns("feat_array")
|
||||
| A feature matrix, with one row per word, and one column per attribute
|
||||
| indicated in the input attr_ids.
|
||||
|
||||
+method("count_by", "attr_id")
|
||||
| Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
| by the values of the given attribute ID.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from spacy.en import English, attrs
|
||||
| >>> nlp = English()
|
||||
| >>> tokens = nlp(u'apple apple orange banana')
|
||||
| >>> tokens.count_by(attrs.ORTH)
|
||||
| {12800L: 1, 11880L: 2, 7561L: 1}
|
||||
| >>> tokens.to_array([attrs.ORTH])
|
||||
| array([[11880],
|
||||
| [11880],
|
||||
| [7561],
|
||||
| [12800]])
|
||||
|
||||
+method("from_array", "attrs, array")
|
||||
| Load from array
|
||||
|
||||
+method("from_bytes")
|
||||
| Deserialize, loading from bytes
|
||||
|
||||
+method("read_bytes")
|
||||
| classmethod
|
||||
|
||||
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
||||
|
||||
// | Merge a multi-word expression into a single token. Currently
|
||||
// | experimental; API is likely to change.
|
||||
|
||||
|
||||
+declare_class("Token")
|
||||
+init
|
||||
+method("__init__", "vocab, doc, offset")
|
||||
+params
|
||||
+param("vocab", types.Vocab)
|
||||
p A Vocab object
|
||||
|
||||
+param("doc", types.Doc)
|
||||
p The parent sequence
|
||||
|
||||
+param("offset", types.int)
|
||||
p The index of the token within the document
|
||||
|
||||
details
|
||||
summary: h4 String Views
|
||||
|
||||
+attribute("orth / orth_")
|
||||
| The form of the word with no string normalization or processing, as
|
||||
| it appears in the string, without trailing whitespace.
|
||||
|
||||
+attribute("lemma / lemma_")
|
||||
| The "base" of the word, with no inflectional suffixes, e.g. the lemma of
|
||||
| "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
|
||||
| <em>derivational</em> suffixes are not stripped, e.g. the lemma of
|
||||
| "instutitions" is "institution", not "institute". Lemmatization is
|
||||
| performed using the WordNet data, but extended to also cover closed-class
|
||||
| words such as pronouns. By default, the WN lemmatizer returns "hi"
|
||||
| as the lemma of "his". We assign pronouns the lemma -PRON-.
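p
| For illustration, per the description above (assuming nlp is a loaded
| English instance):
pre.language-python
code
| doc = nlp(u'geese developing')
| lemmas = [t.lemma_ for t in doc]  # expected, per the description: [u'goose', u'develop']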
|
||||
|
||||
+attribute("lower / lower_")
|
||||
| The form of the word, but forced to lower-case, i.e.
|
||||
pre.language-python: code lower = word.orth\_.lower()
|
||||
|
||||
//+attribute("norm / norm_")
|
||||
// | The form of the word, after language-specific normalizations has been
|
||||
// | applied.
|
||||
|
||||
+attribute("shape / shape_")
|
||||
| A transform of the word's string, to show orthographic features.
|
||||
| The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
|
||||
| to d. After these mappings, sequences of 4 or more of the same character
|
||||
| are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
|
||||
| :) --> :)
|
||||
|
||||
+attribute("prefix / prefix_")
|
||||
| A length-N substring from the start of the word. Length may vary by
|
||||
| language; currently for English n=1, i.e.
|
||||
pre.language-python: code prefix = word.orth\_[:1]
|
||||
|
||||
+attribute("suffix / suffix_")
|
||||
| A length-N substring from the end of the word. Length may vary by
|
||||
| language; currently for English n=3, i.e.
|
||||
pre.language-python: code suffix = word.orth\_[-3:]
|
||||
|
||||
//+attribute("lex_id")
|
||||
// | lex_id
|
||||
|
||||
details
|
||||
summary: h4 Alignment and Output
|
||||
|
||||
+attribute("idx")
|
||||
p Start index of the token in the string
|
||||
|
||||
+method("__len__", "")
|
||||
p Length of the token's orth string, in unicode code-points.
|
||||
|
||||
+method("__unicode__", "")
|
||||
p Same as token.orth_
|
||||
|
||||
+method("__str__", "")
|
||||
p Varies between Python 2 and Python 3
|
||||
|
||||
+attribute("string")
|
||||
p
|
||||
| The form of the word as it appears in the string, <strong>including
|
||||
| trailing whitespace</strong>. This is useful when you need to use
|
||||
| linguistic features to add inline mark-up to the string.
|
||||
|
||||
+method("nbor, i=1")
|
||||
+params
|
||||
+param("i")
|
||||
p Offset relative to token
|
||||
|
||||
details
|
||||
summary: h4 Distributional Features
|
||||
|
||||
+attribute("repvec")
|
||||
p
|
||||
| A "word embedding" representation: a dense real-valued vector that supports
|
||||
| similarity queries between words. By default, spaCy currently loads
|
||||
| vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
|
||||
| model.
|
||||
|
||||
+attribute("cluster")
|
||||
p
|
||||
| The Brown cluster ID of the word. These are often useful features for
|
||||
| linear models. If you're using a non-linear model, particularly a
|
||||
| neural net or random forest, consider using the real-valued word
|
||||
| representation vector, in Token.repvec, instead.
|
||||
|
||||
+attribute("prob")
|
||||
p
|
||||
| The unigram log-probability of the word, estimated from counts from a
|
||||
| large corpus, smoothed using Simple Good Turing estimation.
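p
| For illustration, a sketch of a similarity query over token.repvec; the
| cosine computation here is an assumption, not part of the API:
pre.language-python
code
| import numpy
| doc = nlp(u'apples oranges')
| v1, v2 = doc[0].repvec, doc[1].repvec
| cosine = numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))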
|
||||
|
||||
details
|
||||
summary: h4 Syntactic Tags
|
||||
|
||||
+attribute("pos / pos_")
|
||||
p
|
||||
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
|
||||
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
|
||||
| the 17 tag values are provided in <code>spacy.parts_of_speech</code>.
|
||||
|
||||
+attribute("tag / tag_")
|
||||
p
|
||||
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
|
||||
| <code>DT</code>, etc. These tags are language/corpus specific, and
|
||||
| typically describe part-of-speech and some amount of morphological
|
||||
| information. For instance, in the Penn Treebank tag set, <code>VBZ</code>
|
||||
| is assigned to a present-tense singular verb.
|
||||
|
||||
+attribute("dep / dep_")
|
||||
p
|
||||
| The type of syntactic dependency relation between the word and its
|
||||
| syntactic head.
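p
| For illustration (assuming nlp is a loaded English instance):
pre.language-python
code
| doc = nlp(u'She sells seashells')
| tags = [(w.orth_, w.pos_, w.tag_, w.dep_) for w in doc]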
|
||||
|
||||
details
|
||||
summary: h4 Navigating the Parse Tree
|
||||
|
||||
+attribute("head")
|
||||
p
|
||||
| The Token that is the immediate syntactic head of the word. If the
|
||||
| word is the root of the dependency tree, the same word is returned.
|
||||
|
||||
+attribute("lefts")
|
||||
p
|
||||
| An iterator for the immediate leftward syntactic children of the
|
||||
| word.
|
||||
|
||||
+attribute("rights")
|
||||
p
|
||||
| An iterator for the immediate rightward syntactic children of the
|
||||
| word.
|
||||
|
||||
+attribute("n_lefts")
|
||||
p
|
||||
| The number of immediate syntactic children preceding the word in
|
||||
| the string.
|
||||
|
||||
+attribute("n_rights")
|
||||
p
|
||||
| The number of immediate syntactic children following the word in
|
||||
| the string.
|
||||
|
||||
+attribute("children")
|
||||
p
|
||||
| An iterator that yields from lefts, and then yields from rights.
|
||||
|
||||
+attribute("subtree")
|
||||
p
|
||||
| An iterator for the part of the sentence syntactically governed by
|
||||
| the word, including the word itself.
|
||||
|
||||
+attribute("left_edge")
|
||||
p The leftmost edge of the token's subtree
|
||||
|
||||
+attribute("right_edge")
|
||||
p The rightmost edge of the token's subtree
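p
| For illustration, a sketch of walking the tree with the attributes above:
pre.language-python
code
| doc = nlp(u'I saw the man with the telescope')
| word = doc[3]
| parent = word.head                         # Immediate syntactic head
| kids = list(word.children)                 # lefts, then rights
| phrase = [w.orth_ for w in word.subtree]   # The word and everything it governs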
|
||||
|
||||
details
|
||||
summary: h4 Named Entities
|
||||
|
||||
+attribute("ent_type")
|
||||
p If the token is part of an entity, its entity type.
|
||||
|
||||
+attribute("ent_iob")
|
||||
p The IOB (inside, outside, begin) entity recognition tag for the token.
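p
| For illustration (assuming nlp is a loaded English instance):
pre.language-python
code
| doc = nlp(u'Google was founded in California')
| iob_tags = [(w.orth_, w.ent_type, w.ent_iob) for w in doc]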
|
||||
|
||||
details
|
||||
summary: h4 Lexeme Flags
|
||||
|
||||
+method("check_flag", "flag_id")
|
||||
+params
|
||||
+param("flag_id")
|
||||
| flag ID
|
||||
|
||||
+attribute("is_oov")
|
||||
+attribute("is_alpha")
|
||||
+attribute("is_ascii")
|
||||
+attribute("is_digit")
|
||||
+attribute("is_lower")
|
||||
+attribute("is_title")
|
||||
+attribute("is_punct")
|
||||
+attribute("is_space")
|
||||
+attribute("like_url")
|
||||
+attribute("like_num")
|
||||
+attribute("like_email")
|
||||
|
||||
//+attribute("conjuncts")
|
||||
// | Conjuncts
|
||||
|
||||
+declare_class("Span")
|
||||
+init
|
||||
+method("__init__")
|
||||
Temp
|
||||
|
||||
<code>span = doc[0:4]</code>
|
||||
|
||||
+sequence
|
||||
+method("__getitem__")
|
||||
p Get item
|
||||
|
||||
+method("__iter__")
|
||||
p Iter
|
||||
|
||||
+method("__len__")
|
||||
p Len
|
||||
|
||||
details
|
||||
summary: h4 Parse
|
||||
|
||||
+attribute("root")
|
||||
p Syntactic head
|
||||
|
||||
+attribute("lefts")
|
||||
p Tokens that are:
|
||||
ol
|
||||
li To the left of the span;
|
||||
li Syntactic children of words within the span
|
||||
|
||||
p i.e.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| lefts = [span.doc[i] for i in range(0, span.start)
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
+attribute("rights")
|
||||
p Tokens that are:
|
||||
ol
|
||||
li To the right of the span;
|
||||
li Syntactic children of words within the span
|
||||
p i.e.
|
||||
pre.language-python
|
||||
code
|
||||
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
|
||||
+attribute("subtree")
|
||||
p String
|
||||
|
||||
details
|
||||
summary: h4 String Views
|
||||
|
||||
+attribute("string")
|
||||
p String
|
||||
|
||||
+attribute("lemma / lemma_")
|
||||
p String
|
||||
|
||||
+attribute("label / label_")
|
||||
p String
|
||||
|
||||
+declare_class("Lexeme")
|
||||
p
|
||||
| The Lexeme object represents a lexical type, stored in the vocabulary
|
||||
| – as opposed to a token, occurring in a document.
|
||||
p
|
||||
| Lexemes store various features, so that these features can be computed
|
||||
| once per type, rather than once per token. As job sizes grow, this
|
||||
| can amount to a substantial efficiency improvement.
|
||||
|
||||
p
|
||||
| All Lexeme attributes are therefore context independent, as a single
|
||||
| lexeme is reused for all usages of that word. Lexemes are keyed by
|
||||
| the “orth” attribute.
|
||||
|
||||
p
|
||||
| All Lexeme attributes are accessible directly on the Token object.
|
||||
|
||||
+init
|
||||
+method("__init__")
|
||||
p Init
|
||||
|
||||
details
|
||||
summary: h4 String Features
|
||||
|
||||
+attribute("orth / orth_")
|
||||
p
|
||||
| The form of the word with no string normalization or processing,
|
||||
| as it appears in the string, without trailing whitespace.
|
||||
|
||||
+attribute("lower / lower_")
|
||||
p Tmp
|
||||
|
||||
+attribute("norm / norm_")
|
||||
p Tmp
|
||||
|
||||
+attribute("shape / shape_")
|
||||
p Tmp
|
||||
|
||||
+attribute("prefix / prefix_")
|
||||
p Tmp
|
||||
|
||||
+attribute("suffix / suffix_")
|
||||
p TMP
|
||||
|
||||
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
|
||||
+sequence
|
||||
+method("__len__")
|
||||
+returns
|
||||
p Number of words in the vocabulary.
|
||||
|
||||
+method("__iter__")
|
||||
+returns
|
||||
p Lexeme
|
||||
|
||||
+maptype
|
||||
+method("__getitem__", "key_int")
|
||||
+params
|
||||
+param("key")
|
||||
p Integer ID
|
||||
|
||||
+returns: p A Lexeme object
|
||||
|
||||
+method("__getitem__", "key_str")
|
||||
+params
|
||||
+param("key_str", types.unicode)
|
||||
p A string in the vocabulary
|
||||
|
||||
+returns("Lexeme")
|
||||
|
||||
+method("__setitem__", "orth_str", "props")
|
||||
+params
|
||||
+param("orth_str", types.unicode)
|
||||
p The orth key
|
||||
|
||||
+param("props", types.dict)
|
||||
p A props dictionary
|
||||
|
||||
+returns("None")
|
||||
|
||||
details
|
||||
summary: h4 Import/Export
|
||||
|
||||
+method("dump", "loc")
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
p Path where the vocabulary should be saved
|
||||
|
||||
+method("load_lexemes", "loc")
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
p Path to load the lexemes.bin file from
|
||||
|
||||
+method("load_vectors", "loc")
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
p Path to load the vectors.bin from
|
||||
|
||||
+declare_class("StringStore")
|
||||
+init
|
||||
Tmp
|
||||
|
||||
+sequence
|
||||
+method("__len__")
|
||||
+returns("int")
|
||||
p Number of strings in the string-store
|
||||
|
||||
+method("__iter__")
|
||||
+returns
|
||||
p Lexeme
|
||||
|
||||
+maptype
|
||||
+method("__getitem__", "key_int")
|
||||
+params
|
||||
+param("key_int")
|
||||
p An integer key
|
||||
|
||||
+returns(types.unicode)
|
||||
p The string that the integer key maps to
|
||||
|
||||
+method("__getitem__", "key_unicode")
|
||||
+params
|
||||
+param("key_unicode")
|
||||
p A key, as a unicode string
|
||||
|
||||
+returns(types.int)
|
||||
p The integer ID of the string.
|
||||
|
||||
+method("__getitem__", "key_utf8_bytes")
|
||||
+params
|
||||
+param("key_utf8_bytes", types.bytes)
|
||||
p A key, as a UTF-8 encoded byte-string
|
||||
|
||||
+returns(types.int)
|
||||
p The integer ID of the string.
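p
| For illustration, a sketch of the two-way mapping above (assuming the
| string store is available as nlp.vocab.strings):
pre.language-python
code
| strings = nlp.vocab.strings
| i = strings[u'apple']   # unicode key -> integer ID
| s = strings[i]          # integer ID -> unicode string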
|
||||
|
||||
details
|
||||
summary: h4 Import/Export
|
||||
|
||||
+method("dump", "loc")
|
||||
+params
|
||||
+param("loc")
|
||||
p File path to save the strings.txt to.
|
||||
|
||||
+method("load")
|
||||
+params
|
||||
+param("loc")
|
||||
p File path to load the strings.txt from.
|
135
docs/redesign/blog.jade
Normal file
|
@ -0,0 +1,135 @@
|
|||
mixin Teaser(title, url, date_long, date_short, author, lede)
|
||||
article.post
|
||||
header
|
||||
h2
|
||||
a(href=url)= title
|
||||
.subhead
|
||||
| by
|
||||
a(href='#', rel='author')= author
|
||||
| on
|
||||
time(datetime=date_short)= date_long
|
||||
p!= lede
|
||||
|
||||
a.readmore(href='#') ►
|
||||
|
||||
|
||||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="home.html") Home
|
||||
li: a(href="docs.html") Docs
|
||||
li.active: a(href="blog.html") Blog
|
||||
li: a(href="license.html") License
|
||||
|
||||
main#content(role='main')
|
||||
section.intro.profile
|
||||
p
|
||||
img(src='img/matt.png')
|
||||
| Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore.
|
||||
span.social
|
||||
a(href='#') Follow me on Twitter
|
||||
nav(role='navigation')
|
||||
ul
|
||||
li
|
||||
a.button(href='#') Blog
|
||||
li
|
||||
a.button(href='#tutorials') Tutorials
|
||||
section.blogs
|
||||
+Teaser(
|
||||
"Introducing spaCy",
|
||||
"blog_intro.html",
|
||||
"February 2015",
|
||||
"2015-02-18",
|
||||
"Matthew Honnibal",
|
||||
"<strong>spaCy</strong> is a new library for text processing in Python " +
|
||||
"and Cython. I wrote it because I think small companies are terrible at " +
|
||||
"natural language processing (NLP). Or rather: small companies are using " +
|
||||
"terrible NLP technology."
|
||||
)
|
||||
|
||||
+Teaser(
|
||||
"Parsing English with 500 lines of Python",
|
||||
"blog_parser.html",
|
||||
"December 18, 2013",
|
||||
"2013-12-18",
|
||||
"Matthew Hannibal",
|
||||
"The Natural Language Processing (NLP) community has made big progress" +
|
||||
"in syntactic parsing over the last few years. It’s now possible for a" +
|
||||
"tiny Python implementation to perform better than the widely-used Stanford " +
|
||||
"PCFG parser.")
|
||||
|
||||
article.post
|
||||
header
|
||||
h2
|
||||
a(href='#') Another headline
|
||||
.subhead
|
||||
| by
|
||||
a(href='#', rel='author') Matthew Honnibal
|
||||
| on
|
||||
time(datetime='2013-12-18') December 18, 2013
|
||||
p
|
||||
| The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
|
||||
a.readmore(href='#') ►
|
||||
article.post
|
||||
header
|
||||
h2
|
||||
a(href='#') Another headline
|
||||
.subhead
|
||||
| by
|
||||
a(href='#', rel='author') Matthew Honnibal
|
||||
| on
|
||||
time(datetime='2013-12-18') December 18, 2013
|
||||
p
|
||||
| The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
|
||||
a.readmore(href='#') ►
|
||||
article.post
|
||||
header
|
||||
h2
|
||||
a(href='#') Another headline
|
||||
.subhead
|
||||
| by
|
||||
a(href='#', rel='author') Matthew Honnibal
|
||||
| on
|
||||
time(datetime='2013-12-18') December 18, 2013
|
||||
p
|
||||
| The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
|
||||
a.readmore(href='#') ►
|
||||
.readmore
|
||||
a.button(href='#') Read more posts
|
||||
section.intro
|
||||
h2
|
||||
a.permalink(href='#tutorials', name='tutorials') Tutorials
|
||||
p
|
||||
| Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est.
|
||||
section.tutorials
|
||||
details
|
||||
summary
|
||||
h4 Tutorial #1: How to do something cool
|
||||
p
|
||||
| The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
|
||||
a.readmore(href='#') ►
|
||||
details
|
||||
summary
|
||||
h4 Tutorial #2
|
||||
details
|
||||
summary
|
||||
h4 Tutorial #3
|
||||
|
||||
footer(role="contentinfo")
|
||||
span.slogan.copyright © 2015 Syllogism Co.
|
||||
|
||||
script(src='js/prism.js')
|
|
@ -1,3 +1,5 @@
|
|||
extends ./template_post.jade
|
||||
|
||||
-
|
||||
var urls = {
|
||||
'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/',
|
||||
|
@ -9,7 +11,6 @@
|
|||
'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal'
|
||||
}
|
||||
|
||||
|
||||
- var my_research_software = '<a href="https://github.com/syllog1sm/redshift/tree/develop">my research software</a>'
|
||||
|
||||
- var how_to_write_a_POS_tagger = '<a href="https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/">how to write a part-of-speech tagger</a>'
|
||||
|
@ -18,76 +19,63 @@
|
|||
|
||||
- var buy_a_commercial_license = '<a href="license.html">buy a commercial license</a>'
|
||||
|
||||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
main#content(role='main')
|
||||
article.post
|
||||
p.
|
||||
<strong>spaCy</strong> is a new library for text processing in Python
|
||||
and Cython. I wrote it because I think small companies are terrible at
|
||||
natural language processing (NLP). Or rather: small companies are using
|
||||
terrible NLP technology.
|
||||
|
||||
p.
|
||||
To do great NLP, you have to know a little about linguistics, a lot
|
||||
about machine learning, and almost everything about the latest research.
|
||||
The people who fit this description seldom join small companies.
|
||||
Most are broke – they've just finished grad school.
|
||||
If they don't want to stay in academia, they join Google, IBM, etc.
|
||||
block body_block
|
||||
article.post
|
||||
p.
|
||||
<strong>spaCy</strong> is a new library for text processing in Python
|
||||
and Cython. I wrote it because I think small companies are terrible at
|
||||
natural language processing (NLP). Or rather: small companies are using
|
||||
terrible NLP technology.
|
||||
|
||||
p.
|
||||
The net result is that outside of the tech giants, commercial NLP has
|
||||
changed little in the last ten years. In academia, it's changed entirely.
|
||||
Amazing improvements in quality. Orders of magnitude faster. But the
|
||||
academic code is always GPL, undocumented, unusable, or all three.
|
||||
You could implement the ideas yourself, but the papers are hard to read,
|
||||
and training data is exorbitantly expensive. So what are you left with?
|
||||
A common answer is NLTK, which was written primarily as an educational resource.
|
||||
Nothing past the tokenizer is suitable for production use.
|
||||
p.
|
||||
To do great NLP, you have to know a little about linguistics, a lot
|
||||
about machine learning, and almost everything about the latest research.
|
||||
The people who fit this description seldom join small companies.
|
||||
Most are broke – they've just finished grad school.
|
||||
If they don't want to stay in academia, they join Google, IBM, etc.
|
||||
|
||||
p.
|
||||
I used to think that the NLP community just needed to do more to communicate
|
||||
its findings to software engineers. So I wrote two blog posts, explaining
|
||||
!{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well
|
||||
received, and there's been a bit of interest in !{my_research_software}
|
||||
– even though it's entirely undocumented, and mostly unusable to
|
||||
anyone but me.
|
||||
p.
|
||||
So six months ago I quit my post-doc, and I've been working day and night
|
||||
on spaCy since. I'm now pleased to announce an alpha release.
|
||||
p.
|
||||
The net result is that outside of the tech giants, commercial NLP has
|
||||
changed little in the last ten years. In academia, it's changed entirely.
|
||||
Amazing improvements in quality. Orders of magnitude faster. But the
|
||||
academic code is always GPL, undocumented, unusable, or all three.
|
||||
You could implement the ideas yourself, but the papers are hard to read,
|
||||
and training data is exorbitantly expensive. So what are you left with?
|
||||
A common answer is NLTK, which was written primarily as an educational resource.
|
||||
Nothing past the tokenizer is suitable for production use.
|
||||
|
||||
p.
|
||||
If you're a small company doing NLP, I think spaCy will seem like a minor
|
||||
miracle. It's by far the fastest NLP software ever released. The
|
||||
full processing pipeline completes in 20ms per document, including accurate
|
||||
tagging and parsing. All strings are mapped to integer IDs, tokens are
|
||||
linked to embedded word representations, and a range of useful features
|
||||
are pre-calculated and cached.
|
||||
p.
|
||||
I used to think that the NLP community just needed to do more to communicate
|
||||
its findings to software engineers. So I wrote two blog posts, explaining
|
||||
!{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well
|
||||
received, and there's been a bit of interest in !{my_research_software}
|
||||
– even though it's entirely undocumented, and mostly unusable to
|
||||
anyone but me.
|
||||
p.
|
||||
So six months ago I quit my post-doc, and I've been working day and night
|
||||
on spaCy since. I'm now pleased to announce an alpha release.
|
||||
|
||||
p.
|
||||
If none of that made any sense to you, here's the gist of it. Computers
|
||||
don't understand text. This is unfortunate, because that's what the
|
||||
web almost entirely consists of. We want to recommend people text based
|
||||
on other text they liked. We want to shorten text to display it on a
|
||||
mobile screen. We want to aggregate it, link it, filter it, categorise
|
||||
it, generate it and correct it.
|
||||
p.
|
||||
If you're a small company doing NLP, I think spaCy will seem like a minor
|
||||
miracle. It's by far the fastest NLP software ever released. The
|
||||
full processing pipeline completes in 20ms per document, including accurate
|
||||
tagging and parsing. All strings are mapped to integer IDs, tokens are
|
||||
linked to embedded word representations, and a range of useful features
|
||||
are pre-calculated and cached.
|
||||
|
||||
p.
|
||||
spaCy provides a library of utility functions that help programmers
|
||||
build such products. It's commercial open source software: you can
|
||||
either use it under the AGPL, or you can !{buy_a_commercial_license}
|
||||
under generous terms.
|
||||
p.
|
||||
If none of that made any sense to you, here's the gist of it. Computers
|
||||
don't understand text. This is unfortunate, because that's what the
|
||||
web almost entirely consists of. We want to recommend people text based
|
||||
on other text they liked. We want to shorten text to display it on a
|
||||
mobile screen. We want to aggregate it, link it, filter it, categorise
|
||||
it, generate it and correct it.
|
||||
|
||||
p.
|
||||
spaCy provides a library of utility functions that help programmers
|
||||
build such products. It's commercial open source software: you can
|
||||
either use it under the AGPL, or you can !{buy_a_commercial_license}
|
||||
under generous terms.
|
||||
|
||||
footer(role='contentinfo')
|
||||
|
|
File diff suppressed because it is too large
0
docs/redesign/change_log.jade
Normal file
78
docs/redesign/comparisons.jade
Normal file
|
@ -0,0 +1,78 @@
|
|||
+comparison("NLTK")
|
||||
//+comparison("Pattern")
|
||||
+comparison("CoreNLP")
|
||||
+comparison("ClearNLP")
|
||||
//+comparison("OpenNLP")
|
||||
//+comparison("GATE")
|
||||
|
||||
+comparison("Accuracy Summary")
|
||||
|
||||
+comparison("Speed Summary")
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th.
|
||||
th(colspan=3) Absolute (ms per doc)
|
||||
th(colspan=3) Relative (to spaCy)
|
||||
|
||||
tbody
|
||||
tr
|
||||
td: strong System
|
||||
td: strong Split
|
||||
td: strong Tag
|
||||
td: strong Parse
|
||||
td: strong Split
|
||||
td: strong Tag
|
||||
td: strong Parse
|
||||
|
||||
+row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
||||
+row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
||||
+row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
|
||||
+row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
|
||||
+row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")
|
||||
|
||||
p
|
||||
| <strong>Set up</strong>: 100,000 plain-text documents were streamed
|
||||
| from an SQLite3 database, and processed with an NLP library, to one
|
||||
| of three levels of detail – tokenization, tagging, or parsing.
|
||||
| The tasks are additive: to parse the text you have to tokenize and
|
||||
| tag it. The pre-processing was not subtracted from the times –
|
||||
| I report the time required for the pipeline to complete. I report
|
||||
| mean times per document, in milliseconds.
|
||||
|
||||
p
|
||||
| <strong>Hardware</strong>: Intel i7-3770 (2012)
|
||||
|
||||
|
||||
+comparison("Independent Evaluation")
|
||||
p
|
||||
| Independent evaluation by Yahoo! Labs and Emory
|
||||
| University, to appear at ACL 2015. Higher is better.
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("System", "Language", "Accuracy", "Speed")
|
||||
|
||||
tbody
|
||||
+row("spaCy v0.86", "Cython", "91.9", "13,963")
|
||||
+row("spaCy v0.84", "Cython", "90.6", "13,963")
|
||||
+row("ClearNLP", "Java", "91.7", "10,271")
|
||||
+row("CoreNLP", "Java", "89.6", "8,602")
|
||||
+row("MATE", "Java", "92.5", "550")
|
||||
+row("Turbo", "C++", "92.4", "349")
|
||||
+row("Yara", "Java", "92.3", "340")
|
||||
|
||||
p
|
||||
| Accuracy is % unlabelled arcs correct, speed is tokens per second.
|
||||
|
||||
p
|
||||
| Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory)
|
||||
| performed a detailed comparison of the best parsers available.
|
||||
| All numbers above are taken from the pre-print they kindly made
|
||||
| available to me, except for spaCy v0.86.
|
||||
|
||||
p
|
||||
| I'm particularly grateful to the authors for discussion of their
|
||||
| results, which led to the improvement in accuracy between v0.84 and
|
||||
| v0.86. A tip from Jin-ho (developer of ClearNLP) was particularly
|
||||
| useful.
|
|
@ -1,15 +1,6 @@
|
|||
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
|
||||
extends ./outline.jade
|
||||
|
||||
-
|
||||
var types = {
|
||||
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
|
||||
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
|
||||
'int': py_docs + 'functions.html#int"><em>int</em></a>',
|
||||
'generator': "",
|
||||
'Vocab': "",
|
||||
'Span': "",
|
||||
'Doc': ""
|
||||
}
|
||||
include ./mixins.jade
|
||||
|
||||
|
||||
mixin declare_class(name)
|
||||
|
@ -107,599 +98,32 @@ mixin en_example
|
|||
| doc = nlp(unprocessed_unicode)
|
||||
|
||||
|
||||
doctype html
|
||||
html(lang="en")
|
||||
head
|
||||
meta(charset="utf-8")
|
||||
title spaCy – Industrial-strength NLP
|
||||
meta(name="description" content="")
|
||||
meta(name="author" content="Matthew Honnibal")
|
||||
link(rel="stylesheet" href="css/style.css")
|
||||
<!--[if lt IE 9]>
|
||||
script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
|
||||
<![endif]-->
|
||||
|
||||
body(id="docs")
|
||||
header(role="banner")
|
||||
h1.logo spaCy – Industrial-strength NLP
|
||||
div.slogan API
|
||||
|
||||
block intro_block
|
||||
section(class="intro")
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="#") Home
|
||||
li.active: a(href="#") Docs
|
||||
li: a(href="#") License
|
||||
li: a(href="#") Blog
|
||||
|
||||
main.docs#content
|
||||
|
||||
article
|
||||
+declare_class("English")
|
||||
p Load models into a callable object to process English text.
|
||||
|
||||
+summary
|
||||
+en_example
|
||||
|
||||
+init
|
||||
p
|
||||
| Load the resources. Loading takes 20 seconds, and the instance
|
||||
| consumes 2 to 3 gigabytes of memory.
|
||||
|
||||
p
|
||||
| Intended use is for one instance to be created per process.
|
||||
| You can create more if you're doing something unusual.
|
||||
p
|
||||
| You may wish to make the instance a global variable or "singleton".
|
||||
| We usually instantiate the object in the <code>main()</code>
|
||||
| function and pass it around as an explicit argument.
|
||||
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
|
||||
|
||||
+params
|
||||
+param("data_dir")
|
||||
| The data directory. May be #{None}, to disable any data loading
|
||||
| (including the vocabulary).
|
||||
|
||||
+param("Tokenizer")
|
||||
| A class/function that creates the tokenizer.
|
||||
|
||||
+param("Tagger")
|
||||
| A class/function that creates the part-of-speech tagger.
|
||||
|
||||
+param("Parser")
|
||||
| A class/function that creates the dependency parser.
|
||||
|
||||
+param("Entity")
|
||||
| A class/function that creates the named entity recogniser.
|
||||
|
||||
+param("load_vectors")
|
||||
| A boolean value to control whether the word vectors are loaded.
|
||||
|
||||
+callable
|
||||
+method("__call__", "text, tag=True, parse=True, entity=True")
|
||||
|
||||
+params
|
||||
+param("text", types.unicode)
|
||||
| The text to be processed. No pre-processing needs to be applied,
|
||||
| and any length of text can be submitted. Usually you will submit
|
||||
| a whole document. Text may be zero-length. An exception is raised
|
||||
| if byte strings are supplied.
|
||||
|
||||
+param("tag", bool_type)
|
||||
| Whether to apply the part-of-speech tagger. Required for parsing
|
||||
| and entity recognition.
|
||||
|
||||
+param("parse", bool_type)
|
||||
| Whether to apply the syntactic dependency parser.
|
||||
|
||||
+param("entity", bool_type)
|
||||
| Whether to apply the named entity recognizer.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English()
|
||||
| doc = nlp(u'Some text.') # Applies tagger, parser, entity
|
||||
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
| doc = nlp(u'') # Zero-length tokens, not an error
|
||||
| # doc = nlp(b'Some text') <-- Error: need unicode
|
||||
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
||||
|
||||
|
||||
+declare_class("Doc")
|
||||
p I'm a doc
|
||||
|
||||
+init
|
||||
+method("__init__", "vocab")
|
||||
+params
|
||||
+param("vocab", vocab_type)
|
||||
| A vocabulary object
|
||||
|
||||
+sequence
|
||||
+method("__getitem__", "i", types.int)
|
||||
+returns(types.Token)
|
||||
|
||||
+method("__getitem__", "start_end", types.slice)
|
||||
+returns(types.Span)
|
||||
|
||||
+method("__iter__")
|
||||
| Iterate over tokens
|
||||
|
||||
+method("__len__")
|
||||
| Number of tokens in the document.
|
||||
|
||||
details
|
||||
summary: h4 Spans
|
||||
|
||||
+attribute("sents", types.generator)
|
||||
| Iterate over sentences in the document.
|
||||
|
||||
+attribute("ents", types.generator)
|
||||
| Iterate over named entities in the document.
|
||||
|
||||
+attribute("noun_chunks", types.generator)
|
||||
|
||||
details
|
||||
summary: h4 Export/Import
|
||||
|
||||
+method("to_array", "attr_ids")
|
||||
|
||||
| Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||
| of shape N*M, where N is the length of the sentence.
|
||||
|
||||
+params
|
||||
+param("attr_ids", "list[int]")
|
||||
| A list of attribute ID ints.
|
||||
|
||||
+returns("feat_array")
|
||||
| A feature matrix, with one row per word, and one column per attribute
|
||||
| indicated in the input attr_ids.
|
||||
|
||||
+method("count_by", "attr_id")
|
||||
| Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
| by the values of the given attribute ID.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from spacy.en import English, attrs
|
||||
| >>> nlp = English()
|
||||
| >>> tokens = nlp(u'apple apple orange banana')
|
||||
| >>> tokens.count_by(attrs.ORTH)
|
||||
| {12800L: 1, 11880L: 2, 7561L: 1}
|
||||
| >>> tokens.to_array([attrs.ORTH])
|
||||
| array([[11880],
|
||||
| [11880],
|
||||
| [7561],
|
||||
| [12800]])
|
||||
|
||||
+method("from_array", "attrs, array")
|
||||
| Load from array
|
||||
|
||||
+method("from_bytes")
|
||||
| Deserialize, loading from bytes
|
||||
|
||||
+method("read_bytes")
|
||||
| classmethod
|
||||
|
||||
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
||||
|
||||
// | Merge a multi-word expression into a single token. Currently
|
||||
// | experimental; API is likely to change.
|
||||
|
||||
|
||||
+declare_class("Token")
|
||||
+init
|
||||
+method("__init__", "vocab, doc, offset")
|
||||
+params
|
||||
+param("vocab", types.Vocab)
|
||||
p A Vocab object
|
||||
|
||||
+param("doc", types.Doc)
|
||||
p The parent sequence
|
||||
|
||||
+param("offset", types.int)
|
||||
p The index of the token within the document
|
||||
|
||||
details
|
||||
summary: h4 String Views
|
||||
|
||||
+attribute("orth / orth_")
|
||||
| The form of the word with no string normalization or processing, as
|
||||
| it appears in the string, without trailing whitespace.
|
||||
|
||||
+attribute("lemma / lemma_")
|
||||
| The "base" of the word, with no inflectional suffixes, e.g. the lemma of
|
||||
| "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
|
||||
| <em>derivational</em> suffixes are not stripped, e.g. the lemma of
|
||||
| "instutitions" is "institution", not "institute". Lemmatization is
|
||||
| performed using the WordNet data, but extended to also cover closed-class
|
||||
| words such as pronouns. By default, the WN lemmatizer returns "hi"
|
||||
| as the lemma of "his". We assign pronouns the lemma -PRON-.
|
||||
|
||||
+attribute("lower / lower_")
|
||||
| The form of the word, but forced to lower-case, i.e.
|
||||
pre.language-python: code lower = word.orth\_.lower()
|
||||
|
||||
//+attribute("norm / norm_")
|
||||
// | The form of the word, after language-specific normalizations has been
|
||||
// | applied.
|
||||
|
||||
+attribute("shape / shape_")
|
||||
| A transform of the word's string, to show orthographic features.
|
||||
| The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
|
||||
| to d. After these mappings, sequences of 4 or more of the same character
|
||||
| are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
|
||||
| :) --> :)
|
||||
|
||||
+attribute("prefix / prefix_")
|
||||
| A length-N substring from the start of the word. Length may vary by
|
||||
| language; currently for English n=1, i.e.
|
||||
pre.language-python: code prefix = word.orth\_[:1]
|
||||
|
||||
+attribute("suffix / suffix_")
|
||||
| A length-N substring from the end of the word. Length may vary by
|
||||
| language; currently for English n=3, i.e.
|
||||
pre.language-python: code suffix = word.orth\_[-3:]
|
||||
|
||||
//+attribute("lex_id")
|
||||
// | lex_id
|
||||
|
||||
details
|
||||
summary: h4 Alignment and Output
|
||||
|
||||
+attribute("idx")
|
||||
p Start index of the token in the string
|
||||
|
||||
+method("__len__", "")
|
||||
p Length of the token's orth string, in unicode code-points.
|
||||
|
||||
+method("__unicode__", "")
|
||||
p Same as token.orth_
|
||||
|
||||
+method("__str__", "")
|
||||
p Varies between Python 2 and Python 3
|
||||
|
||||
+attribute("string")
|
||||
p
|
||||
| The form of the word as it appears in the string, <strong>including
|
||||
| trailing whitespace</strong>. This is useful when you need to use
|
||||
| linguistic features to add inline mark-up to the string.
|
||||
|
||||
+method("nbor, i=1")
|
||||
+params
|
||||
+param("i")
|
||||
p Offset relative to token
|
||||
|
||||
details
|
||||
summary: h4 Distributional Features
|
||||
|
||||
+attribute("repvec")
|
||||
p
|
||||
| A "word embedding" representation: a dense real-valued vector that supports
|
||||
| similarity queries between words. By default, spaCy currently loads
|
||||
| vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
|
||||
| model.
|
||||
|
||||
+attribute("cluster")
|
||||
p
|
||||
| The Brown cluster ID of the word. These are often useful features for
|
||||
| linear models. If you're using a non-linear model, particularly a
|
||||
| neural net or random forest, consider using the real-valued word
|
||||
| representation vector, in Token.repvec, instead.
|
||||
|
||||
+attribute("prob")
|
||||
p
|
||||
| The unigram log-probability of the word, estimated from counts from a
|
||||
| large corpus, smoothed using Simple Good Turing estimation.
|
||||
|
||||
details
|
||||
summary: h4 Syntactic Tags
|
||||
|
||||
+attribute("pos / pos_")
|
||||
p
|
||||
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
|
||||
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
|
||||
| the 17 tag values are provided in <code>spacy.parts_of_speech</code>.
|
||||
|
||||
+attribute("tag / tag_")
|
||||
p
|
||||
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
|
||||
| <code>DT</code>, etc. These tags are language/corpus specific, and
|
||||
| typically describe part-of-speech and some amount of morphological
|
||||
| information. For instance, in the Penn Treebank tag set, <code>VBZ</code>
|
||||
| is assigned to a present-tense singular verb.
|
||||
|
||||
+attribute("dep / dep_")
|
||||
p
|
||||
| The type of syntactic dependency relation between the word and its
|
||||
| syntactic head.
|
||||
|
||||
details
|
||||
summary: h4 Navigating the Parse Tree
|
||||
|
||||
+attribute("head")
|
||||
p
|
||||
| The Token that is the immediate syntactic head of the word. If the
|
||||
| word is the root of the dependency tree, the same word is returned.
|
||||
|
||||
+attribute("lefts")
|
||||
p
|
||||
| An iterator for the immediate leftward syntactic children of the
|
||||
| word.
|
||||
|
||||
+attribute("rights")
|
||||
p
|
||||
| An iterator for the immediate rightward syntactic children of the
|
||||
| word.
|
||||
|
||||
+attribute("n_lefts")
|
||||
p
|
||||
| The number of immediate syntactic children preceding the word in
|
||||
| the string.
|
||||
|
||||
+attribute("n_rights")
|
||||
p
|
||||
| The number of immediate syntactic children following the word in
|
||||
| the string.
|
||||
|
||||
+attribute("children")
|
||||
p
|
||||
| An iterator that yields from lefts, and then yields from rights.
|
||||
|
||||
+attribute("subtree")
|
||||
p
|
||||
| An iterator for the part of the sentence syntactically governed by
|
||||
| the word, including the word itself.
|
||||
|
||||
+attribute("left_edge")
|
||||
p The leftmost edge of the token's subtree
|
||||
|
||||
+attribute("right_edge")
|
||||
p The rightmost edge of the token's subtree
|
||||
|
||||
details
|
||||
summary: h4 Named Entities
|
||||
|
||||
+attribute("ent_type")
|
||||
p If the token is part of an entity, its entity type.
|
||||
|
||||
+attribute("ent_iob")
|
||||
p The IOB (inside, outside, begin) entity recognition tag for the token.
|
||||
|
||||
details
|
||||
summary: h4 Lexeme Flags
|
||||
|
||||
+method("check_flag", "flag_id")
|
||||
+params
|
||||
+param("flag_id")
|
||||
| flag ID
|
||||
|
||||
+attribute("is_oov")
|
||||
+attribute("is_alpha")
|
||||
+attribute("is_ascii")
|
||||
+attribute("is_digit")
|
||||
+attribute("is_lower")
|
||||
+attribute("is_title")
|
||||
+attribute("is_punct")
|
||||
+attribute("is_space")
|
||||
+attribute("like_url")
|
||||
+attribute("like_num")
|
||||
+attribute("like_email")
|
||||
|
||||
//+attribute("conjuncts")
|
||||
// | Conjuncts
|
||||
|
||||
+declare_class("Span")
|
||||
+init
|
||||
+method("__init__")
|
||||
Temp
|
||||
|
||||
<code>span = doc[0:4]</code>
|
||||
|
||||
+sequence
|
||||
+method("__getitem__")
|
||||
p Get item
|
||||
|
||||
+method("__iter__")
|
||||
p Iter
|
||||
|
||||
+method("__len__")
|
||||
p Len
|
||||
|
||||
details
|
||||
summary: h4 Parse
|
||||
|
||||
+attribute("root")
|
||||
p Syntactic head
|
||||
|
||||
+attribute("lefts")
|
||||
p Tokens that are:
|
||||
ol
|
||||
li To the left of the span;
|
||||
li Syntactic children of words within the span
|
||||
|
||||
p i.e.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| lefts = [span.doc[i] for i in range(0, span.start)
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
+attribute("rights")
|
||||
p Tokens that are:
|
||||
ol
|
||||
li To the right of the span;
|
||||
li Syntactic children of words within the span
|
||||
p i.e.
|
||||
pre.language-python
|
||||
code
|
||||
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
|
||||
+attribute("subtree")
|
||||
p String
|
||||
|
||||
details
|
||||
summary: h4 String Views
|
||||
|
||||
+attribute("string")
|
||||
p String
|
||||
|
||||
+attribute("lemma / lemma_")
|
||||
p String
|
||||
|
||||
+attribute("label / label_")
|
||||
p String
|
||||
|
||||
+declare_class("Lexeme")
|
||||
p
|
||||
| The Lexeme object represents a lexical type, stored in the vocabulary
|
||||
| – as opposed to a token, occurring in a document.
|
||||
p
|
||||
| Lexemes store various features, so that these features can be computed
|
||||
| once per type, rather than once per token. As job sizes grow, this
|
||||
| can amount to a substantial efficiency improvement.
|
||||
|
||||
p
|
||||
| All Lexeme attributes are therefore context independent, as a single
|
||||
| lexeme is reused for all usages of that word. Lexemes are keyed by
|
||||
| the “orth” attribute.
|
||||
|
||||
p
|
||||
| All Lexeme attributes are accessible directly on the Token object.
|
||||
|
||||
+init
|
||||
+method("__init__")
|
||||
p Init
|
||||
|
||||
details
|
||||
summary: h4 String Features
|
||||
|
||||
+attribute("orth / orth_")
|
||||
p
|
||||
| The form of the word with no string normalization or processing,
|
||||
| as it appears in the string, without trailing whitespace.
|
||||
|
||||
+attribute("lower / lower_")
|
||||
p Tmp
|
||||
|
||||
+attribute("norm / norm_")
|
||||
p Tmp
|
||||
|
||||
+attribute("shape / shape_")
|
||||
p Tmp
|
||||
|
||||
+attribute("prefix / prefix_")
|
||||
p Tmp
|
||||
|
||||
+attribute("suffix / suffix_")
|
||||
p TMP
|
||||
|
||||
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
|
||||
+sequence
|
||||
+method("__len__")
|
||||
+returns
|
||||
p Number of words in the vocabulary.
|
||||
|
||||
+method("__iter__")
|
||||
+returns
|
||||
p Lexeme
|
||||
|
||||
+maptype
|
||||
+method("__getitem__", "key_int")
|
||||
+params
|
||||
+param("key")
|
||||
p Integer ID
|
||||
|
||||
+returns: p A Lexeme object
|
||||
|
||||
+method("__getitem__", "key_str")
|
||||
+params
|
||||
+param("key_str", types.unicode)
|
||||
p A string in the vocabulary
|
||||
|
||||
+returns("Lexeme")
|
||||
|
||||
+method("__setitem__", "orth_str", "props")
|
||||
+params
|
||||
+param("orth_str", types.unicode)
|
||||
p The orth key
|
||||
|
||||
+param("props", types.dict)
|
||||
p A props dictionary
|
||||
|
||||
+returns("None")
|
||||
|
||||
details
|
||||
summary: h4 Import/Export
|
||||
|
||||
+method("dump", "loc")
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
p Path where the vocabulary should be saved
|
||||
|
||||
+method("load_lexemes", "loc")
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
p Path to load the lexemes.bin file from
|
||||
|
||||
+method("load_vectors", "loc")
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
p Path to load the vectors.bin from
|
||||
|
||||
+declare_class("StringStore")
|
||||
+init
|
||||
Tmp
|
||||
|
||||
+sequence
|
||||
+method("__len__")
|
||||
+returns("int")
|
||||
p Number of strings in the string-store
|
||||
|
||||
+method("__iter__")
|
||||
+returns
|
||||
p Lexeme
|
||||
|
||||
+maptype
|
||||
+method("__getitem__", "key_int")
|
||||
+params
|
||||
+param("key_int")
|
||||
p An integer key
|
||||
|
||||
+returns(types.unicode)
|
||||
p The string that the integer key maps to
|
||||
|
||||
+method("__getitem__", "key_unicode")
|
||||
+params
|
||||
+param("key_unicode")
|
||||
p A key, as a unicode string
|
||||
|
||||
+returns(types.int)
|
||||
p The integer ID of the string.
|
||||
|
||||
+method("__getitem__", "key_utf8_bytes")
|
||||
+params
|
||||
+param("key_utf8_bytes", types.bytes)
|
||||
p A key, as a UTF-8 encoded byte-string
|
||||
|
||||
+returns(types.int)
|
||||
p The integer ID of the string.
|
||||
|
||||
details
|
||||
summary: h4 Import/Export
|
||||
|
||||
+method("dump", "loc")
|
||||
+params
|
||||
+param("loc")
|
||||
p File path to save the strings.txt to.
|
||||
|
||||
+method("load")
|
||||
+params
|
||||
+param("loc")
|
||||
p File path to load the strings.txt from.
|
||||
|
||||
script(src="js/prism.js")
|
||||
li: a(href="#api" class="button") API
|
||||
li: a(href="#tutorials" class="button") Tutorials
|
||||
li: a(href="#spec" class="button") Spec
|
||||
|
||||
|
||||
block body_block
|
||||
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
|
||||
|
||||
-
|
||||
var types = {
|
||||
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
|
||||
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
|
||||
'int': py_docs + 'functions.html#int"><em>int</em></a>',
|
||||
'generator': "",
|
||||
'Vocab': "",
|
||||
'Span': "",
|
||||
'Doc': ""
|
||||
}
|
||||
|
||||
article
|
||||
|
||||
+Section("API", "api", "api.jade")
|
||||
+Section("Tutorals", "tutorials", "tutorials.jade")
|
||||
+Section("Annotation Specifications", "spec", "spec.jade")
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
extends ./outline.jade
|
||||
|
||||
include ./mixins.jade
|
||||
|
||||
// Notes
|
||||
//
|
||||
// 1. Where to put version notice? Should say something like
|
||||
|
@ -16,11 +18,13 @@ mixin lede
|
|||
- var state_of_the_art = '<a href="#">state-of-the-art</a>'
|
||||
- var a_minor_miracle = '<a href="">a minor miracle</a>'
|
||||
- var great_documentation = '<a href="">great documentation</a>'
|
||||
- var concise_API = '<a href="">concise API</a>'
|
||||
|
||||
p.
|
||||
<a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
|
||||
library for industrial-strength NLP in Python and Cython. It features
|
||||
!{state_of_the_art} speed and accuracy, a concise API, and great documentation.
|
||||
library for industrial-strength natural language processing in Python and
|
||||
Cython. It features !{state_of_the_art} speed and accuracy, a !{concise_API},
|
||||
and <a href="#license">license terms</a> designed to get out of your way.
|
||||
If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
|
||||
like !{a_minor_miracle}.
|
||||
|
||||
|
@ -36,7 +40,6 @@ mixin get_started()
|
|||
p.
|
||||
Get Started
|
||||
|
||||
|
||||
mixin comparison(name)
|
||||
details
|
||||
summary
|
||||
|
@ -68,19 +71,6 @@ mixin social
|
|||
| Discuss on Reddit
|
||||
|
||||
|
||||
mixin Section(title_text, link_name, include_file)
|
||||
a(name=link_name): h3 #{title_text}
|
||||
|
||||
if (link_name == "example-use")
|
||||
include ./usage_examples.jade
|
||||
else if (link_name == "online-demo")
|
||||
include ./online_demo.jade
|
||||
else if (link_name == "comparisons")
|
||||
include ./comparisons.jade
|
||||
else if (link_name == "install")
|
||||
include ./installation.jade
|
||||
|
||||
|
||||
block intro_block
|
||||
section(class="intro")
|
||||
+lede
|
||||
|
@ -90,7 +80,9 @@ block intro_block
|
|||
li: a(href="#example-use" class="button") Examples
|
||||
li: a(href="#online-demo" class="button") Demo
|
||||
li: a(href="#comparisons" class="button") Comparisons
|
||||
li: a(href="#install" class="button") Install v0.89
|
||||
li: a(href="#install" class="button")
|
||||
| Install
|
||||
<span class="button-caption">v0.89</span>
|
||||
|
||||
|
||||
block body_block
|
||||
|
@ -103,4 +95,3 @@ block body_block
|
|||
+Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")
|
||||
|
||||
+Section("Install", "install", "./install.jade")
|
||||
|
||||
|
|
|
@ -1,40 +1,71 @@
|
|||
p With Python 2.7 or Python 3, using Linux or OSX, run:
|
||||
mixin Option(name, open)
|
||||
details(open=open)
|
||||
summary
|
||||
h4= name
|
||||
block
|
||||
|
||||
pre.language-bash: code
|
||||
| $ pip install spacy
|
||||
| $ python -m spacy.en.download
|
||||
+Option("conda", true)
|
||||
pre.language-bash: code
|
||||
| $ conda install spacy
|
||||
| $ python -m spacy.en.download
|
||||
|
||||
p
|
||||
| The download command fetches about 300 MB of data for
|
||||
| the parser model and word vectors, and places it within the spacy.en
|
||||
| package directory.
|
||||
+Option("pip and virtualenv", true)
|
||||
p With Python 2.7 or Python 3, using Linux or OSX, run:
|
||||
|
||||
p
|
||||
| If you're stuck using a server with an old version of Python, and you
|
||||
| don't have root access, I've prepared a bootstrap script to help you
|
||||
| compile a local Python install. Run:
|
||||
pre.language-bash: code
|
||||
| $ pip install spacy
|
||||
| $ python -m spacy.en.download
|
||||
|
||||
pre.language-bash: code
|
||||
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
p
|
||||
| The download command fetches about 300 MB of data for
|
||||
| the parser model and word vectors, and places it within the spacy.en
|
||||
| package directory.
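p
  | To check the download worked, a couple of lines of Python serve as a
  | smoke test (a minimal sketch):

pre.language-python: code
  | from spacy.en import English
  |
  | nlp = English()
  | doc = nlp(u'Hello, world. Here are two sentences.')
  | # If the data is in place, this prints the tokens.
  | print([w.orth_ for w in doc])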
|
||||
|
||||
p
|
||||
| The other way to install the package is to clone the github repository,
|
||||
| and build it from source. This installs an additional dependency,
|
||||
| Cython. If you're using Python 2, I also recommend installing fabric
|
||||
| and fabtools – this is how I build the project.
|
||||
|
||||
pre.language-bash: code
|
||||
| $ git clone https://github.com/honnibal/spaCy.git
|
||||
| $ cd spaCy
|
||||
| $ virtualenv .env && source .env/bin/activate
|
||||
| $ export PYTHONPATH=`pwd`
|
||||
| $ pip install -r requirements.txt
|
||||
| $ python setup.py build_ext --inplace
|
||||
| $ python -m spacy.en.download
|
||||
| $ pip install pytest
|
||||
| $ py.test tests/
|
||||
+Option("Workaround for obsolete system Python", false)
|
||||
p
|
||||
| If you're stuck using a server with an old version of Python, and you
|
||||
| don't have root access, I've prepared a bootstrap script to help you
|
||||
| compile a local Python install. Run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
|
||||
|
||||
|
||||
+Option("Compile from source", false)
|
||||
p
|
||||
| The other way to install the package is to clone the github repository,
|
||||
| and build it from source. This installs an additional dependency,
|
||||
| Cython. If you're using Python 2, I also recommend installing fabric
|
||||
| and fabtools – this is how I build the project.
|
||||
|
||||
pre.language-bash: code
|
||||
| $ git clone https://github.com/honnibal/spaCy.git
|
||||
| $ cd spaCy
|
||||
| $ virtualenv .env && source .env/bin/activate
|
||||
| $ export PYTHONPATH=`pwd`
|
||||
| $ pip install -r requirements.txt
|
||||
| $ python setup.py build_ext --inplace
|
||||
| $ python -m spacy.en.download
|
||||
| $ pip install pytest
|
||||
| $ py.test tests/
|
||||
|
||||
p
|
||||
| Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
| for C extensions built via Cython that require large data files. So,
|
||||
| please report issues as you encounter them.
|
||||
|
||||
+Option("pypy (Unsupported)")
|
||||
| If PyPy support is a priority for you, please get in touch. We could likely
|
||||
| fix the remaining issues, if necessary. However, the library is likely to
|
||||
| be much slower on PyPy, as it's written in Cython, which produces code tuned
|
||||
| for the performance of CPython.
|
||||
|
||||
+Option("Windows (Unsupported)")
|
||||
| Unfortunately we don't currently have access to a Windows machine, and have
|
||||
| no experience developing on a Microsoft stack. In theory the only problems are
|
||||
| with the installation and packaging – there should be no deep platform
|
||||
| dependency. However, we can't debug these issues at present, simply due
|
||||
| to lack of a development environment.
|
||||
|
||||
p
|
||||
| Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
| for C extensions built via Cython that require large data files. So,
|
||||
| please report issues as you encounter them.
|
||||
|
|
179
docs/redesign/license.jade
Normal file
|
@ -0,0 +1,179 @@
|
|||
extends ./outline.jade
|
||||
|
||||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
||||
mixin LicenseOption(name, period, price, audience)
|
||||
.item
|
||||
h4 #{name}
|
||||
|
||||
.focus #{period}
|
||||
|
||||
span #{price}
|
||||
|
||||
h5 Suggested for:
|
||||
|
||||
span #{audience}
|
||||
|
||||
a.button(href="spacy_trial_free.docx") Download license
|
||||
|
||||
span or
|
||||
a(href="#") get in touch
|
||||
|
||||
|
||||
block body_block
|
||||
article.pricing
|
||||
|
||||
.box.license
|
||||
+LicenseOption("Trial", "90 days", "$0", "Evaluation")
|
||||
+LicenseOption("Production", "1 year", "$5,000", "Production")
|
||||
+LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning")
|
||||
|
||||
p.caption
|
||||
| Researcher, hobbyist, or open-source developer? spaCy also offers
|
||||
a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3
|
||||
| licenses.
|
||||
|
||||
p.
|
||||
What we offer is a rare, simple certainty: a long-term, permissive license
|
||||
that comes with full access to the source, complete transparency, and almost
|
||||
complete flexibility. The difference between this and a black-box API is
|
||||
night and day. You cannot build a great product against a service you
|
||||
don't understand, and you can't build a great business on a service you
|
||||
don't control.
|
||||
|
||||
p
|
||||
| Let's face it: services disappear. Constantly. The good start-ups get
|
||||
| bought; the bad ones go bankrupt. Open-source projects become abandoned
|
||||
| or bloated. Google's graveyard is overflowing – ditto for Yahoo!,
|
||||
| Microsoft, etc. Sure, IBM won't go broke – but will BlueMix be sunset?
|
||||
|
||||
p
|
||||
| A 5-year license won't expire until 2020. spaCy will be with you for
|
||||
| longer than most of your current staff. If that's still not enough,
|
||||
| get in touch. I'm sure we can work something out.
|
||||
|
||||
//p.
|
||||
// To make spaCy as valuable as possible, licenses to it are for life. You get
|
||||
// complete transparency, certainty and control. If you need to use spaCy
|
||||
// as an API, it's trivial to host it yourself – and you don't need to
|
||||
// worry about the service changing or disappearing. And if you're ever in
|
||||
// acquisition or IPO talks, the story is simple.
|
||||
|
||||
//p.
|
||||
// spaCy can also be used as free open-source software, under the Aferro GPL
|
||||
// license. If you use it this way, you must comply with the AGPL license
|
||||
// terms. When you distribute your project, or offer it as a network service,
|
||||
// you must distribute the source-code and grant users an AGPL license to it.
|
||||
|
||||
|
||||
//h3 Examples
|
||||
|
||||
//p.
|
||||
// In order to clarify how spaCy's license structure might apply to you, I've
|
||||
// written a few examples, in the form of user-stories.
|
||||
|
||||
//details
|
||||
// summary: h4 Seed stage start-ups
|
||||
|
||||
// p.
|
||||
// Ashley and Casey have an idea for a start-up. To explore their idea, they
|
||||
// want to build a minimum viable product they can put in front of potential
|
||||
// users and investors.
|
||||
|
||||
// p. They have two options.
|
||||
|
||||
// ol
|
||||
// li
|
||||
// p.
|
||||
// <strong>Trial commercial license.</strong> With a simple form, they can
|
||||
// use spaCy for 90 days, for a nominal fee of $1. They are free to modify
|
||||
// spaCy, and they will own the copyright to their modifications for the
|
||||
// duration of the license. After the trial period elapses, they can either
|
||||
// pay the license fee, stop using spaCy, release their project under the
|
||||
// AGPL.
|
||||
//
|
||||
// li
|
||||
// p.
|
||||
// <strong>AGPL.</strong> Casey and Pat can instead use spaCy under the AGPL
|
||||
// license. However, they must then release any code that statically or
|
||||
// dynamically links to spaCy under the AGPL as well (e.g. if they import
|
||||
// the module, or import a module that imports it, etc). They also cannot
|
||||
// use spaCy as a network resource, by running it as a service --- this is
|
||||
// the loophole that the "A" part of the AGPL is designed to close.
|
||||
//
|
||||
// p.
|
||||
// Ashley and Casey find the AGPL license unattractive for commercial use.
|
||||
// They decide to take up the trial commercial license. However, over the
|
||||
// next 90 days, Ashley has to move house twice, and Casey gets sick. By
|
||||
// the time the trial expires, they still don't have a demo they can show
|
||||
// investors. They send an email explaining the situation, and a 90 day extension
|
||||
// to their trial license is granted.
|
||||
|
||||
// p.
|
||||
// By the time the extension period has elapsed, spaCy has helped them secure
|
||||
// funding, and they even have a little revenue. They are glad to pay the
|
||||
// $5,000 commercial license fee.
|
||||
|
||||
// p.
|
||||
// spaCy is now permanently licensed for the product Ashley and Casey are
|
||||
// developing. They own the copyright to any modifications they make to spaCy,
|
||||
// but not to the original spaCy code.
|
||||
|
||||
// p.
|
||||
// No additional fees will be due when they hire new developers, run spaCy on
|
||||
// additional internal servers, etc. If their company is acquired, the license
|
||||
// will be transferred to the company acquiring them. However, to use spaCy
|
||||
// in another product, they will have to buy a second license.
|
||||
|
||||
|
||||
// details
|
||||
// summary: h4 University academics
|
||||
|
||||
// p.
|
||||
// Alex and Sasha are post-doctoral researchers working for a university.
|
||||
// Part of their funding comes from a grant from Google, but Google will not
|
||||
// own any part of the work that they produce. Their mission is just to write
|
||||
// papers.
|
||||
|
||||
// p.
|
||||
// Alex and Sasha find spaCy convenient, so they use it in their system under
|
||||
// the AGPL. This means that their system must also be released under the
|
||||
// AGPL, but they're cool with that – they were going to release their
|
||||
// code anyway, as it's the only way to ensure their experiments are properly
|
||||
// repeatable.
|
||||
|
||||
// p.
|
||||
// Alex and Sasha find and fix a few bugs in spaCy. They must release these
|
||||
// modifications, and they ask that they be accepted into the main spaCy repo.
|
||||
// In order to do this, they must sign a contributor agreement, ceding their
|
||||
// copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
|
||||
// not be able to claim any royalties from their contributions.
|
||||
|
||||
// p.
|
||||
// Later, Alex and Sasha implement new features into spaCy, for another paper.
|
||||
// The code was quite rushed, and they don't want to take the time to put
|
||||
// together a proper pull request. They must release their modifications
|
||||
// under the AGPL, but they are not obliged to contribute it to the spaCy
|
||||
// repository, or concede their copyright.
|
||||
|
||||
// details
|
||||
// summary: h4 Open Source developers
|
||||
|
||||
// p.
|
||||
// Phuong and Jessie use the open-source software Calibre to manage their
|
||||
// e-book libraries. They have an idea for a search feature, and they want
|
||||
// to use spaCy to implement it. Calibre is released under the GPLv3. The
|
||||
// AGPL has additional restrictions for projects used as a network resource,
|
||||
// but they don't apply to this project, so Phuong and Jessie can use spaCy
|
||||
// to improve Calibre. They'll have to release their code, but that was
|
||||
// always their intention anyway.
|
19
docs/redesign/mixins.jade
Normal file
|
@ -0,0 +1,19 @@
|
|||
mixin Section(title_text, link_name, include_file)
|
||||
h3: a(name=link_name href="#" + link_name) #{title_text}
|
||||
|
||||
if (link_name == "example-use")
|
||||
include ./usage_examples.jade
|
||||
else if (link_name == "online-demo")
|
||||
include ./online_demo.jade
|
||||
else if (link_name == "comparisons")
|
||||
include ./comparisons.jade
|
||||
else if (link_name == "install")
|
||||
include ./installation.jade
|
||||
else if (link_name == "api")
|
||||
include ./api.jade
|
||||
else if (link_name == "tutorials")
|
||||
include ./tutorials.jade
|
||||
else if (link_name == "spec")
|
||||
include ./spec.jade
|
||||
|
||||
|
|
@ -21,10 +21,10 @@ html(lang="en")
|
|||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="#") Home
|
||||
li: a(href="#") Docs
|
||||
li: a(href="#") License
|
||||
li: a(href="#") More
|
||||
li: a(href="home.html") Home
|
||||
li: a(href="docs.html") Docs
|
||||
li: a(href="license.html") License
|
||||
li: a(href="blog.html") Blog
|
||||
|
||||
main(id="content" role="main")
|
||||
block intro_block
|
||||
|
|
|
@ -1,5 +1,3 @@
|
|||
extends ./outline.jade
|
||||
|
||||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
|
@ -12,112 +10,120 @@ mixin row(...cells)
|
|||
td= cell
|
||||
|
||||
|
||||
block body_block
|
||||
article(class="page docs-page")
|
||||
p.
|
||||
This document describes the target annotations spaCy is trained to predict.
|
||||
This is currently a work in progress. Please ask questions on the issue tracker,
|
||||
so that the answers can be integrated here to improve the documentation.
|
||||
details
|
||||
summary: h4 Overview
|
||||
|
||||
h2 Tokenization
|
||||
p.
|
||||
This document describes the target annotations spaCy is trained to predict.
|
||||
This is currently a work in progress. Please ask questions on the issue tracker,
|
||||
so that the answers can be integrated here to improve the documentation.
|
||||
|
||||
p Tokenization standards are based on the OntoNotes 5 corpus.
|
||||
details
|
||||
summary: h4 Tokenization
|
||||
|
||||
p.
|
||||
The tokenizer differs from most by including tokens for significant
|
||||
whitespace. Any sequence of whitespace characters beyond a single space
|
||||
(' ') is included as a token. For instance:
|
||||
p Tokenization standards are based on the OntoNotes 5 corpus.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English(parse=False)
|
||||
| tokens = nlp('Some\nspaces and\ttab characters')
|
||||
| print([t.orth_ for t in tokens])
|
||||
p.
|
||||
The tokenizer differs from most by including tokens for significant
|
||||
whitespace. Any sequence of whitespace characters beyond a single space
|
||||
(' ') is included as a token. For instance:
|
||||
|
||||
p Which produces:
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English(parse=False)
|
||||
| tokens = nlp('Some\nspaces and\ttab characters')
|
||||
| print([t.orth_ for t in tokens])
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
|
||||
p Which produces:
|
||||
|
||||
p.
|
||||
The whitespace tokens are useful for much the same reason punctuation is
|
||||
– it's often an important delimiter in the text. By preserving
|
||||
it in the token output, we are able to maintain a simple alignment
|
||||
between the tokens and the original string, and we ensure that no
|
||||
information is lost during processing.
|
||||
pre.language-python
|
||||
code
|
||||
| ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
|
||||
|
||||
h3 Sentence boundary detection
|
||||
p.
|
||||
The whitespace tokens are useful for much the same reason punctuation is
|
||||
– it's often an important delimiter in the text. By preserving
|
||||
it in the token output, we are able to maintain a simple alignment
|
||||
between the tokens and the original string, and we ensure that no
|
||||
information is lost during processing.
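p
  | For example, the token strings can be joined back into the input (a
  | small sketch – it assumes each token's <code>.string</code> includes
  | its trailing whitespace, as in the search examples later on):

pre.language-python
  code
    | from spacy.en import English
    |
    | nlp = English(parse=False)
    | text = 'Some\nspaces and\ttab characters'
    | tokens = nlp(text)
    | # Joining the token strings should reproduce the input exactly.
    | print(''.join(t.string for t in tokens) == text)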
|
||||
|
||||
p.
|
||||
Sentence boundaries are calculated from the syntactic parse tree, so
|
||||
features such as punctuation and capitalisation play an important but
|
||||
non-decisive role in determining the sentence boundaries. Usually this
|
||||
means that the sentence boundaries will at least coincide with clause
|
||||
boundaries, even given poorly punctuated text.
|
||||
details
|
||||
summary: h4 Sentence boundary detection
|
||||
|
||||
h3 Part-of-speech Tagging
|
||||
p.
|
||||
Sentence boundaries are calculated from the syntactic parse tree, so
|
||||
features such as punctuation and capitalisation play an important but
|
||||
non-decisive role in determining the sentence boundaries. Usually this
|
||||
means that the sentence boundaries will at least coincide with clause
|
||||
boundaries, even given poorly punctuated text.
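p
  | A minimal sketch of reading the boundaries off the parse (the
  | <code>sents</code> iterator over the processed text is assumed here):

pre.language-python
  code
    | from spacy.en import English
    |
    | nlp = English()
    | tokens = nlp(u'This is one sentence this is another one')
    | # Each sentence is a span of tokens, derived from the parse tree.
    | for sentence in tokens.sents:
    |     print(''.join(t.string for t in sentence).strip())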
|
||||
|
||||
p.
|
||||
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
|
||||
tag set. We also map the tags to the simpler Google Universal POS Tag set.
|
||||
details
|
||||
summary: h4 Part-of-speech Tagging
|
||||
|
||||
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
|
||||
p.
|
||||
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
|
||||
tag set. We also map the tags to the simpler Google Universal POS Tag set.
|
||||
|
||||
h3 Lemmatization
|
||||
p.
|
||||
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
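p
  | For instance (a small sketch – the <code>tag_</code> and
  | <code>pos_</code> attributes for the Treebank and Google tag sets are
  | assumptions here):

pre.language-python
  code
    | from spacy.en import English
    |
    | nlp = English()
    | tokens = nlp(u'They told us to duck.')
    | # The fine-grained Treebank tag and the coarse Google tag, side by side.
    | print([(t.orth_, t.tag_, t.pos_) for t in tokens])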
|
||||
|
||||
p.
|
||||
A "lemma" is the uninflected form of a word. In English, this means:
|
||||
details
|
||||
summary: h4 Lemmatization
|
||||
|
||||
ul
|
||||
li Adjectives: The form like "happy", not "happier" or "happiest"
|
||||
li Adverbs: The form like "badly", not "worse" or "worst"
|
||||
li Nouns: The form like "dog", not "dogs"; like "child", not "children"
|
||||
li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
p.
|
||||
A "lemma" is the uninflected form of a word. In English, this means:
|
||||
|
||||
p.
|
||||
The lemmatization data is taken from WordNet. However, we also add a
|
||||
special case for pronouns: all pronouns are lemmatized to the special
|
||||
token -PRON-.
|
||||
ul
|
||||
li Adjectives: The form like "happy", not "happier" or "happiest"
|
||||
li Adverbs: The form like "badly", not "worse" or "worst"
|
||||
li Nouns: The form like "dog", not "dogs"; like "child", not "children"
|
||||
li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
|
||||
p.
|
||||
The lemmatization data is taken from WordNet. However, we also add a
|
||||
special case for pronouns: all pronouns are lemmatized to the special
|
||||
token -PRON-.
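p
  | A small sketch (the <code>lemma_</code> attribute is the one used in
  | the tutorials below):

pre.language-python
  code
    | from spacy.en import English
    |
    | nlp = English()
    | tokens = nlp(u'I was writing to them')
    | # Note the -PRON- special case for the pronouns.
    | print([(t.orth_, t.lemma_) for t in tokens])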
|
||||
|
||||
|
||||
h3 Syntactic Dependency Parsing
|
||||
details
|
||||
summary: h4 Syntactic Dependency Parsing
|
||||
|
||||
p.
|
||||
The parser is trained on data produced by the ClearNLP converter. Details
|
||||
of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
|
||||
p.
|
||||
The parser is trained on data produced by the ClearNLP converter. Details
|
||||
of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
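p
  | A minimal sketch of walking the parse, using the <code>dep_</code> and
  | <code>head</code> attributes that also appear in the tutorials below:

pre.language-python
  code
    | from spacy.en import English
    |
    | nlp = English()
    | tokens = nlp(u'Google bought the company')
    | # Each word points to its syntactic head, with a typed relation.
    | print([(t.orth_, t.dep_, t.head.orth_) for t in tokens])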
|
||||
|
||||
h3 Named Entity Recognition
|
||||
details
|
||||
summary: h4 Named Entity Recognition
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
|
||||
tbody
|
||||
+row("PERSON", "People, including fictional.")
|
||||
+row("NORP", "Nationalities or religious or political groups.")
|
||||
+row("FACILITY", "Buildings, airports, highways, bridges, etc.")
|
||||
+row("ORG", "Companies, agencies, institutions, etc.")
|
||||
+row("GPE", "Countries, cities, states.")
|
||||
+row("LOC", "Non-GPE locations, mountain ranges, bodies of water.")
|
||||
+row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services")
|
||||
+row("EVENT", "Named hurricanes, battles, wars, sports events, etc.")
|
||||
+row("WORK_OF_ART", "Titles of books, songs, etc.")
|
||||
+row("LAW", "Named documents made into laws")
|
||||
+row("LANGUAGE", "Any named language")
|
||||
tbody
|
||||
+row("PERSON", "People, including fictional.")
|
||||
+row("NORP", "Nationalities or religious or political groups.")
|
||||
+row("FACILITY", "Buildings, airports, highways, bridges, etc.")
|
||||
+row("ORG", "Companies, agencies, institutions, etc.")
|
||||
+row("GPE", "Countries, cities, states.")
|
||||
+row("LOC", "Non-GPE locations, mountain ranges, bodies of water.")
|
||||
+row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services")
|
||||
+row("EVENT", "Named hurricanes, battles, wars, sports events, etc.")
|
||||
+row("WORK_OF_ART", "Titles of books, songs, etc.")
|
||||
+row("LAW", "Named documents made into laws")
|
||||
+row("LANGUAGE", "Any named language")
|
||||
|
||||
p The following values are also annotated in a style similar to names:
|
||||
p The following values are also annotated in a style similar to names:
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
|
||||
tbody
|
||||
+row("DATE", "Absolute or relative dates or periods")
|
||||
+row("TIME", "Times smaller than a day")
|
||||
+row("PERCENT", 'Percentage (including “%”)')
|
||||
+row("MONEY", "Monetary values, including unit")
|
||||
+row("QUANTITY", "Measurements, as of weight or distance")
|
||||
+row("ORDINAL", 'first", "second"')
|
||||
+row("CARDINAL", "Numerals that do not fall under another type")
|
||||
tbody
|
||||
+row("DATE", "Absolute or relative dates or periods")
|
||||
+row("TIME", "Times smaller than a day")
|
||||
+row("PERCENT", 'Percentage (including “%”)')
|
||||
+row("MONEY", "Monetary values, including unit")
|
||||
+row("QUANTITY", "Measurements, as of weight or distance")
|
||||
+row("ORDINAL", 'first", "second"')
|
||||
+row("CARDINAL", "Numerals that do not fall under another type")
|
||||
|
|
31
docs/redesign/template_post.jade
Normal file
|
@ -0,0 +1,31 @@
|
|||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog(role="document")
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="home.html") Home
|
||||
li: a(href="docs.html") Docs
|
||||
li.active: a(href="blog.html") Blog
|
||||
li: a(href="license.html") License
|
||||
|
||||
main#content(role='main')
|
||||
block intro_block
|
||||
|
||||
block body_block
|
||||
|
||||
footer(role='contentinfo')
|
||||
|
||||
script(src="js/prism.js")
|
||||
script(src="js/details_polyfill.js")
|
200
docs/redesign/tute_adverbs.jade
Normal file
|
@ -0,0 +1,200 @@
|
|||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
main#content(role='main')
|
||||
article.post
|
||||
|
||||
|
||||
:markdown-it
|
||||
# Adverbs
|
||||
|
||||
Let's say you're developing a proofreading tool, or possibly an IDE for
|
||||
writers. You're convinced by Stephen King's advice that [adverbs are
|
||||
not your friend](http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs/),
|
||||
so you want to **highlight all adverbs**. We'll use one of the examples
|
||||
he finds particularly egregious:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV
|
||||
| >>> # Load the pipeline, and call it with some text.
|
||||
| >>> nlp = spacy.en.English()
|
||||
| >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False)
|
||||
| >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
|
||||
| u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
:markdown-it
|
||||
Easy enough --- but the problem is that we've also highlighted "back".
|
||||
While "back" is undoubtedly an adverb, we probably don't want to highlight
|
||||
it. If what we're trying to do is flag dubious stylistic choices, we'll
|
||||
need to refine our logic. It turns out only a certain type of adverb
|
||||
is of interest to us.
|
||||
|
||||
|
||||
:markdown-it
|
||||
There are lots of ways we might do this, depending on just what words
|
||||
we want to flag. The simplest way to exclude adverbs like "back" and
|
||||
"not" is by word frequency: these words are much more common than the
|
||||
prototypical manner adverbs that the style guides are worried about.
|
||||
|
||||
:markdown-it
|
||||
The `Lexeme.prob` and `Token.prob` attributes give a
|
||||
log probability estimate of the word:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> nlp.vocab[u'back'].prob
|
||||
| -7.403977394104004
|
||||
| >>> nlp.vocab[u'not'].prob
|
||||
| -5.407193660736084
|
||||
| >>> nlp.vocab[u'quietly'].prob
|
||||
| -11.07155704498291
|
||||
|
||||
:markdown-it
|
||||
(The probability estimate is based on counts from a 3 billion word corpus,
|
||||
smoothed using the Simple Good-Turing method.)
|
||||
|
||||
So we can easily exclude the N most frequent words in English from our
|
||||
adverb marker. Let's try N=1000 for now:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV
|
||||
| >>> nlp = spacy.en.English()
|
||||
| >>> # Find log probability of Nth most frequent word
|
||||
| >>> probs = [lex.prob for lex in nlp.vocab]
|
||||
| >>> probs.sort()
|
||||
| >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
|
||||
| >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
|
||||
| >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
|
||||
| ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
:markdown-it
|
||||
There are lots of other ways we could refine the logic, depending on
|
||||
just what words we want to flag. Let's say we wanted to only flag
|
||||
adverbs that modified words similar to "pleaded". This is easy to do,
|
||||
as spaCy loads a vector-space representation for every word (by default,
|
||||
the vectors produced by Levy and Goldberg (2014)). Naturally, the
|
||||
vector is provided as a numpy array:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> pleaded = tokens[7]
|
||||
| >>> pleaded.repvec.shape
|
||||
| (300,)
|
||||
| >>> pleaded.repvec[:5]
|
||||
| array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32)
|
||||
|
||||
:markdown-it
|
||||
We want to sort the words in our vocabulary by their similarity to
|
||||
"pleaded". There are lots of ways to measure the similarity of two
|
||||
vectors. We'll use the cosine metric:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from numpy import dot
|
||||
| >>> from numpy.linalg import norm
|
||||
|
||||
| >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
| >>> words = [w for w in nlp.vocab if w.has_repvec]
|
||||
| >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
|
||||
| >>> words.reverse()
|
||||
| >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
|
||||
| 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading
|
||||
| >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
|
||||
| 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses
|
||||
| >>> print('100-110', ', '.join(w.orth_ for w in words[100:110]))
|
||||
| 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes
|
||||
| >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
|
||||
| 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged
|
||||
| >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010]))
|
||||
| 50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists
|
||||
|
||||
:markdown-it
|
||||
As you can see, the similarity model that these vectors give us is excellent
|
||||
--- we're still getting meaningful results at 1000 words, off a single
|
||||
prototype! The only problem is that the list really contains two clusters of
|
||||
words: one associated with the legal meaning of "pleaded", and one for the more
|
||||
general sense. Sorting out these clusters is an area of active research.
|
||||
|
||||
A simple work-around is to average the vectors of several words, and use that
|
||||
as our target:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested']
|
||||
| >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs)
|
||||
| >>> words.sort(key=lambda w: cosine(w.repvec, say_vector))
|
||||
| >>> words.reverse()
|
||||
| >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
|
||||
| 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired
|
||||
| >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
|
||||
| 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed
|
||||
| >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
|
||||
| 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate
|
||||
|
||||
:markdown-it
|
||||
These definitely look like words that King might scold a writer for attaching
|
||||
adverbs to. Recall that our original adverb highlighting function looked like
|
||||
this:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV
|
||||
| >>> # Load the pipeline, and call it with some text.
|
||||
| >>> nlp = spacy.en.English()
|
||||
| >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
|
||||
| tag=True, parse=False)
|
||||
| >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens))
|
||||
| ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
|
||||
:markdown-it
|
||||
We wanted to refine the logic so that only adverbs modifying evocative
|
||||
verbs of communication, like "pleaded", were highlighted. We've now
|
||||
built a vector that represents that type of word, so now we can highlight
|
||||
adverbs based on subtle logic, homing in on adverbs that seem the most
|
||||
stylistically problematic, given our starting assumptions:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import numpy
|
||||
| >>> from numpy import dot
|
||||
| >>> from numpy.linalg import norm
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV, VERB
|
||||
| >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
| >>> def is_bad_adverb(token, target_verb, tol):
|
||||
| ... if token.pos != ADV:
|
||||
| ... return False
|
||||
| ... elif token.head.pos != VERB:
|
||||
| ... return False
|
||||
| ... elif cosine(token.head.repvec, target_verb) < tol:
|
||||
| ... return False
|
||||
| ... else:
|
||||
| ... return True
|
||||
|
||||
:markdown-it
|
||||
This example was somewhat contrived --- and, truth be told, I've never
|
||||
really bought the idea that adverbs were a grave stylistic sin. But
|
||||
hopefully it got the message across: the state-of-the-art NLP technologies
|
||||
are very powerful. spaCy gives you easy and efficient access to them,
|
||||
which lets you build all sorts of useful products and features that
|
||||
were previously impossible.
|
||||
|
||||
footer(role='contentinfo')
|
||||
script(src='js/prism.js')
|
132
docs/redesign/tute_syntax_search.jade
Normal file
|
@ -0,0 +1,132 @@
|
|||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
main#content(role='main')
|
||||
section.intro
|
||||
p
|
||||
| Example use of the spaCy NLP tools for data exploration.
|
||||
| Here we will look for reddit comments that describe Google doing something,
|
||||
| i.e. discuss the company's actions. This is difficult, because other senses of
|
||||
| "Google" now dominate usage of the word in conversation, particularly references to
|
||||
| using Google products.
|
||||
|
||||
p
|
||||
| The heuristics used are quick and dirty – about 5 minutes' work.
|
||||
|
||||
//| A better approach is to use the word vector of the verb. But, the
|
||||
// | demo here is just to show what's possible to build up quickly, to
|
||||
// | start to understand some data.
|
||||
|
||||
article.post
|
||||
header
|
||||
h2 Syntax-specific Search
|
||||
.subhead
|
||||
| by
|
||||
a(href='#', rel='author') Matthew Honnibal
|
||||
| on
|
||||
time(datetime='2015-08-14') August
|
||||
|
||||
details
|
||||
summary: h4 Imports
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from __future__ import unicode_literals
|
||||
| from __future__ import print_function
|
||||
| import sys
|
||||
|
|
||||
| import plac
|
||||
| import bz2
|
||||
| import ujson
|
||||
| import spacy.en
|
||||
|
||||
details
|
||||
summary: h4 Load the model and iterate over the data
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def main(input_loc):
|
||||
| nlp = spacy.en.English() # Loading the model takes 10-20 seconds.
|
||||
| for line in bz2.BZ2File(input_loc): # Iterate over the reddit comments from the dump.
|
||||
| comment_str = ujson.loads(line)['body'] # Parse the json object, and extract the 'body' attribute.
|
||||
|
|
||||
details
|
||||
summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| comment_parse = nlp(comment_str)
|
||||
| for word in comment_parse:
|
||||
| if google_doing_something(word):
|
||||
| # Print the clause
|
||||
| print(''.join(w.string for w in word.head.subtree).strip())
|
||||
details
|
||||
summary: h4 Define the filter function
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
|
||||
|
|
||||
| def google_doing_something(w):
|
||||
| if w.lower_ != 'google':
|
||||
| return False
|
||||
| # Is it the subject of a verb?
|
||||
| elif w.dep_ != 'nsubj':
|
||||
| return False
|
||||
| # And not 'is'
|
||||
| elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux':
|
||||
| return False
|
||||
| # Exclude e.g. "Google says..."
|
||||
| elif w.head.lemma_ in ('say', 'show'):
|
||||
| return False
|
||||
| else:
|
||||
| return True
|
||||
|
|
||||
|
|
||||
|
||||
details
|
||||
summary: h4 Call main
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| if __name__ == '__main__':
|
||||
| plac.call(main)
|
||||
|
||||
details
|
||||
summary: h4 Example output
|
||||
|
||||
p.
|
||||
Many false positives remain. Some are from incorrect interpretations
|
||||
of the sentence by spaCy, some are flaws in our filtering logic. But
|
||||
the results are vastly better than a string-based search, which returns
|
||||
almost no examples of the pattern we're looking for.
|
||||
|
||||
code
|
||||
| Google dropped support for Android < 4.0 already
|
||||
| google drive
|
||||
| Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc
|
||||
| When Google responds
|
||||
| Google translate cyka pasterino.
|
||||
| A quick google looks like Synology does have a sync'ing feature which does support block level so that should work
|
||||
| (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible?
|
||||
| Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop.
|
||||
| Google offers something like this already, but it is truly terrible.
|
||||
| google isn't helping me
|
||||
| Google tells me: 0 results, 250 pages removed from google.
|
||||
| how did Google swoop in and eat our lunch
|
||||
|
||||
|
||||
|
||||
script(src="js/prism.js")
|
||||
script(src="js/details_polyfill.js")
|
204
docs/redesign/tute_twitter.jade
Normal file
|
@ -0,0 +1,204 @@
|
|||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
main#content(role='main')
|
||||
article.post
|
||||
header
|
||||
h2 Finding Relevant Tweets
|
||||
.subhead
|
||||
| by
|
||||
a(href='#', rel='author') Matthew Honnibal
|
||||
| on
|
||||
time(datetime='2015-08-14') December
|
||||
|
||||
details
|
||||
summary: h4 Imports
|
||||
pre.language-python
|
||||
|
||||
| from __future__ import unicode_literals, print_function
|
||||
| import plac
|
||||
| import codecs
|
||||
| import sys
|
||||
| import math
|
||||
|
|
||||
| import spacy.en
|
||||
| from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
|
||||
|
|
||||
| from termcolor import colored
|
||||
| from twython import TwythonStreamer
|
||||
|
|
||||
| from os import path
|
||||
| from math import sqrt
|
||||
|
|
||||
| from numpy import dot
|
||||
| from numpy.linalg import norm
|
||||
|
|
||||
|
|
||||
|
||||
details
|
||||
summary: h4 Simple vector-averaging similarity
|
||||
|
||||
pre.language-python: code
|
||||
|
||||
| class Meaning(object):
|
||||
| def __init__(self, vectors):
|
||||
| if vectors:
|
||||
| self.vector = sum(vectors) / len(vectors)
|
||||
| self.norm = norm(self.vector)
|
||||
| else:
|
||||
| self.vector = None
|
||||
| self.norm = 0
|
||||
|
|
||||
| @classmethod
|
||||
| def from_path(cls, nlp, loc):
|
||||
| with codecs.open(loc, 'r', 'utf8') as file_:
|
||||
| terms = file_.read().strip().split()
|
||||
| return cls.from_terms(nlp, terms)
|
||||
|
|
||||
| @classmethod
|
||||
| def from_tokens(cls, nlp, tokens):
|
||||
| vectors = [t.repvec for t in tokens]
|
||||
| return cls(vectors)
|
||||
|
|
||||
| @classmethod
|
||||
| def from_terms(cls, nlp, examples):
|
||||
| lexemes = [nlp.vocab[eg] for eg in examples]
|
||||
| vectors = [eg.repvec for eg in lexemes]
|
||||
| return cls(vectors)
|
||||
|
|
||||
| def similarity(self, other):
|
||||
| if not self.norm or not other.norm:
|
||||
| return -1
|
||||
| return dot(self.vector, other.vector) / (self.norm * other.norm)
|
||||
|
|
||||
|
||||
details
|
||||
summary: h4 Print matches
|
||||
|
||||
pre.language-python: code
|
||||
|
||||
|
|
||||
| def print_colored(model, stream=sys.stdout):
|
||||
| if model['is_match']:
|
||||
| color = 'green'
|
||||
| elif model['is_reject']:
|
||||
| color = 'red'
|
||||
| else:
|
||||
| color = 'grey'
|
||||
|
|
||||
| if not model['is_rare'] and model['is_match'] and not model['is_reject']:
|
||||
| match_score = colored('%.3f' % model['match_score'], 'green')
|
||||
| reject_score = colored('%.3f' % model['reject_score'], 'red')
|
||||
| prob = '%.5f' % model['prob']
|
||||
|
|
||||
| print(match_score, reject_score, prob)
|
||||
| print(repr(model['text']), color)
|
||||
| print('')
|
||||
|
|
||||
|
|
||||
|
||||
details
|
||||
summary: h4 TextMatcher: Process the tweets using spaCy
|
||||
|
||||
pre.language-python: code
|
||||
|
||||
| class TextMatcher(object):
|
||||
| def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
|
||||
| self.nlp = nlp
|
||||
| self.get_target = get_target
|
||||
| self.get_reject = get_reject
|
||||
| self.min_prob = min_prob
|
||||
| self.min_match = min_match
|
||||
| self.max_reject = max_reject
|
||||
|
|
||||
| def __call__(self, text):
|
||||
| tweet = self.nlp(text)
|
||||
| target_terms = self.get_target()
|
||||
| reject_terms = self.get_reject()
|
||||
|
|
||||
| prob = sum(math.exp(w.prob) for w in tweet) / len(tweet)
|
||||
| meaning = Meaning.from_tokens(self.nlp, tweet)
|
||||
|
|
||||
| match_score = meaning.similarity(self.get_target())
|
||||
| reject_score = meaning.similarity(self.get_reject())
|
||||
| return {
|
||||
| 'text': tweet.string,
|
||||
| 'prob': prob,
|
||||
| 'match_score': match_score,
|
||||
| 'reject_score': reject_score,
|
||||
| 'is_rare': prob < self.min_prob,
|
||||
| 'is_match': prob >= self.min_prob and match_score >= self.min_match,
|
||||
| 'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
|
||||
| }
|
||||
|
|
||||
|
|
||||
|
||||
details
|
||||
summary: h4 Connect to Twitter and stream tweets
|
||||
|
||||
pre.language-python: code
|
||||
|
||||
| class Connection(TwythonStreamer):
|
||||
| def __init__(self, keys_dir, handler, view):
|
||||
| keys = Secrets(keys_dir)
|
||||
| TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
|
||||
| self.handler = handler
|
||||
| self.view = view
|
||||
|
|
||||
| def on_success(self, data):
|
||||
| text = data.get('text', u'')
|
||||
| # Twython returns either bytes or unicode, depending on tweet.
|
||||
| # #APIshaming
|
||||
| try:
|
||||
| model = self.handler(text)
|
||||
| except TypeError:
|
||||
| model = self.handler(text.decode('utf8'))
|
||||
| status = self.view(model, sys.stdout)
|
||||
|
|
||||
| def on_error(self, status_code, data):
|
||||
| print(status_code)
|
||||
|
|
||||
|
|
||||
| class Secrets(object):
|
||||
| def __init__(self, key_dir):
|
||||
| self.key = open(path.join(key_dir, 'key.txt')).read().strip()
|
||||
| self.secret = open(path.join(key_dir, 'secret.txt')).read().strip()
|
||||
| self.token = open(path.join(key_dir, 'token.txt')).read().strip()
|
||||
| self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip()
|
||||
|
|
||||
|
|
||||
|
||||
details
|
||||
summary: h4 Command-line interface
|
||||
|
||||
pre.language-python: code
|
||||
|
||||
| def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
|
||||
| # We don't need the parser for this demo, so may as well save the loading time
|
||||
| nlp = spacy.en.English(Parser=None)
|
||||
| get_target = lambda: Meaning.from_path(nlp, target_loc)
|
||||
| get_reject = lambda: Meaning.from_path(nlp, reject_loc)
|
||||
| matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
|
||||
|
|
||||
| twitter = Connection(keys_dir, matcher, print_colored)
|
||||
| twitter.statuses.filter(track=term)
|
||||
|
|
||||
|
|
||||
| if __name__ == '__main__':
|
||||
| plac.call(main)
|
||||
|
|
||||
|
||||
footer(role='contentinfo')
|
||||
script(src='js/prism.js')
|
||||
|
0
docs/redesign/tutorials.jade
Normal file
|
@ -106,4 +106,11 @@ mixin example(name)
|
|||
|
||||
+example("Efficient binary serialization")
|
||||
pre.language-python: code
|
||||
|
|
||||
|
||||
| byte_string = doc.as_bytes()
|
||||
| open('/tmp/moby_dick.bin', 'wb').write(byte_string)
|
||||
|
||||
| nlp = spacy.en.English()
|
||||
| for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
|
||||
| doc = Doc(nlp.vocab)
|
||||
| doc.from_bytes(byte_string)
|
||||
|
|