mirror of https://github.com/explosion/spaCy.git (synced 2025-03-13 07:55:49 +03:00)

* Remove old docs

This commit is contained in:
parent cad0cca4e3
commit 890d6aa216

@@ -1,661 +0,0 @@
mixin declare_class(name)
    details
        summary
            span.declaration
                span.label class
                code #{name}
        block

mixin method(name, parameters)
    details(open=attributes.open)
        summary
            span.declaration
                span.label #{name}
                span.parameters
                    | self, #{parameters}
        block

mixin params
    ul
        block

mixin param(name, type, value)
    li
        if type
            <strong>#{name}</strong> (!{type}) –
        else
            <strong>#{name}</strong> –
        block

mixin attribute(name, type, value)
    details(open=attributes.open)
        summary
            span.declaration
                span.label #{name}
        block

mixin returns(name, type, value)
    li
        if type
            <strong>#{name}</strong> (!{type}) –
        else
            <strong>#{name}</strong> –
        block

mixin returns(type)
    | tmp

mixin init
    details
        summary: h4 Init

        block

mixin callable
    details
        summary: h4 Callable

        block

mixin sequence
    details
        summary: h4 Sequence

        block

mixin maptype
    details
        summary: h4 Map

        block

mixin summary
    block

mixin en_example
    pre.language-python
        code
            | from spacy.en import English
            | from spacy._doc_examples import download_war_and_peace
            |
            | unprocessed_unicode = download_war_and_peace()
            |
            | nlp = English()
            | doc = nlp(unprocessed_unicode)


+declare_class("English")
    p Load models into a callable object to process English text.

    +summary
        +en_example

    +init
        p
            | Load the resources. Loading takes 20 seconds, and the instance
            | consumes 2 to 3 gigabytes of memory.

        p
            | Intended use is for one instance to be created per process.
            | You can create more if you're doing something unusual.
        p
            | You may wish to make the instance a global variable or "singleton".
            | We usually instantiate the object in the <code>main()</code>
            | function and pass it around as an explicit argument.
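        p
            | For example (an illustrative sketch; the <code>texts</code>
            | argument stands in for your own data):

        pre.language-python
            code
                | def main(texts):
                |     nlp = English()   # build the expensive instance once
                |     for text in texts:
                |         doc = nlp(text)   # pass nlp around; don't rebuild it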
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
|
||||
|
||||
+params
|
||||
+param("data_dir")
|
||||
| The data directory. May be #{None}, to disable any data loading
|
||||
| (including the vocabulary).
|
||||
|
||||
+param("Tokenizer")
|
||||
| A class/function that creates the tokenizer.
|
||||
|
||||
+param("Tagger")
|
||||
| A class/function that creates the part-of-speech tagger.
|
||||
|
||||
+param("Parser")
|
||||
| A class/function that creates the dependency parser.
|
||||
|
||||
+param("Entity")
|
||||
| A class/function that creates the named entity recogniser.
|
||||
|
||||
+param("load_vectors")
|
||||
| A boolean value to control whether the word vectors are loaded.
|
||||
|
||||
+callable
|
||||
+method("__call__", "text, tag=True, parse=True, entity=True")
|
||||
|
||||
+params
|
||||
+param("text", types.unicode)
|
||||
| The text to be processed. No pre-processing needs to be applied,
|
||||
| and any length of text can be submitted. Usually you will submit
|
||||
| a whole document. Text may be zero-length. An exception is raised
|
||||
| if byte strings are supplied.
|
||||
|
||||
+param("tag", types.bool)
|
||||
| Whether to apply the part-of-speech tagger. Required for parsing
|
||||
| and entity recognition.
|
||||
|
||||
+param("parse", types.bool)
|
||||
| Whether to apply the syntactic dependency parser.
|
||||
|
||||
+param("entity", types.bool)
|
||||
| Whether to apply the named entity recognizer.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English()
|
||||
| doc = nlp(u'Some text.) # Applies tagger, parser, entity
|
||||
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
| doc = nlp(u'') # Zero-length tokens, not an error
|
||||
| # doc = nlp(b'Some text') <-- Error: need unicode
|
||||
| doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
|
||||
|
||||
|
||||
+declare_class("Doc")
|
||||
p I'm a doc
|
||||
|
||||
+init
|
||||
+method("__init__", "vocab")
|
||||
+params
|
||||
+param("vocab", vocab_type)
|
||||
| A vocabulary object
|
||||
|
||||
+sequence
|
||||
+method("__getitem__", "i", types.int)
|
||||
+returns(types.Token)
|
||||
|
||||
+method("__getitem__", "start_end", types.slice)
|
||||
+returns(types.Span)
|
||||
|
||||
+method("__iter__")
|
||||
| Iterate over tokens
|
||||
|
||||
+method("__len__")
|
||||
| Number of tokens in the document.
|
||||
|
||||
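        p
            | Together, these give Doc the usual Python sequence behaviour
            | (an illustrative sketch, assuming the English pipeline above):

        pre.language-python
            code
                | doc = nlp(u'Some text.')
                | token = doc[0]              # first Token
                | span = doc[0:2]             # Span over the first two tokens
                | n_tokens = len(doc)
                | words = [t.orth_ for t in doc]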
    details
        summary: h4 Spans

        +attribute("sents", types.generator)
            | Iterate over sentences in the document.

        +attribute("ents", types.generator)
            | Iterate over named entities in the document.

        +attribute("noun_chunks", types.generator)

    details
        summary: h4 Export/Import

        +method("to_array", "attr_ids")
            | Given a list of M attribute IDs, export the tokens to a numpy ndarray
            | of shape N*M, where N is the length of the sentence.

            +params
                +param("attr_ids", "list[int]")
                    | A list of attribute ID ints.

            +returns("feat_array")
                | A feature matrix, with one row per word, and one column per attribute
                | indicated in the input attr_ids.

        +method("count_by", "attr_id")
            | Produce a dict of {attribute (int): count (int)} frequencies, keyed
            | by the values of the given attribute ID.

            pre.language-python
                code
                    | >>> from spacy.en import English, attrs
                    | >>> nlp = English()
                    | >>> tokens = nlp(u'apple apple orange banana')
                    | >>> tokens.count_by(attrs.ORTH)
                    | {12800L: 1, 11880L: 2, 7561L: 1}
                    | >>> tokens.to_array([attrs.ORTH])
                    | array([[11880],
                    |        [11880],
                    |        [7561],
                    |        [12800]])

        +method("from_array", "attrs, array")
            | Load from array
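            p
                | For instance, a round-trip through the array representation
                | might look like this (an illustrative sketch based on the
                | signatures above; the <code>new_doc</code> name is hypothetical):

            pre.language-python
                code
                    | feat_array = doc.to_array([attrs.ORTH])
                    | new_doc = Doc(nlp.vocab)
                    | new_doc.from_array([attrs.ORTH], feat_array)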
+method("from_bytes")
|
||||
| Deserialize, loading from bytes
|
||||
|
||||
+method("read_bytes")
|
||||
| classmethod
|
||||
|
||||
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
|
||||
|
||||
// | Merge a multi-word expression into a single token. Currently
|
||||
// | experimental; API is likely to change.
|
||||
|
||||
|
||||
+declare_class("Token")
|
||||
+init
|
||||
+method("__init__", "vocab, doc, offset")
|
||||
+params
|
||||
+param("vocab", types.Vocab)
|
||||
p A Vocab object
|
||||
|
||||
+param("doc", types.Doc)
|
||||
p The parent sequence
|
||||
|
||||
+param("offset", types.int)
|
||||
p The index of the token within the document
|
||||
|
||||
details
|
||||
summary: h4 String Views
|
||||
|
||||
+attribute("orth / orth_")
|
||||
| The form of the word with no string normalization or processing, as
|
||||
| it appears in the string, without trailing whitespace.
|
||||
|
||||
+attribute("lemma / lemma_")
|
||||
| The "base" of the word, with no inflectional suffixes, e.g. the lemma of
|
||||
| "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
|
||||
| <em>derivational</em> suffixes are not stripped, e.g. the lemma of
|
||||
| "instutitions" is "institution", not "institute". Lemmatization is
|
||||
| performed using the WordNet data, but extended to also cover closed-class
|
||||
| words such as pronouns. By default, the WN lemmatizer returns "hi"
|
||||
| as the lemma of "his". We assign pronouns the lemma -PRON-.
|
||||
|
||||
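            p
                | For example (an illustrative sketch; the exact analyses
                | depend on the loaded models):

            pre.language-python
                code
                    | doc = nlp(u'The geese were developing')
                    | print([(w.orth_, w.lemma_) for w in doc])
                    | # e.g. [(u'The', u'the'), (u'geese', u'goose'),
                    | #       (u'were', u'be'), (u'developing', u'develop')]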
+attribute("lower / lower_")
|
||||
| The form of the word, but forced to lower-case, i.e.
|
||||
pre.language-python: code lower = word.orth\_.lower()
|
||||
|
||||
//+attribute("norm / norm_")
|
||||
// | The form of the word, after language-specific normalizations has been
|
||||
// | applied.
|
||||
|
||||
+attribute("shape / shape_")
|
||||
| A transform of the word's string, to show orthographic features.
|
||||
| The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
|
||||
| to d. After these mappings, sequences of 4 or more of the same character
|
||||
| are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
|
||||
| :) --> :)
|
||||
|
||||
+attribute("prefix / prefix_")
|
||||
| A length-N substring from the start of the word. Length may vary by
|
||||
| language; currently for English n=1, i.e.
|
||||
pre.language-python: code prefix = word.orth\_[:1]
|
||||
|
||||
+attribute("suffix / suffix_")
|
||||
| A length-N substring from the end of the word. Length may vary by
|
||||
| language; currently for English n=3, i.e.
|
||||
pre.language-python: code suffix = word.orth\_[-3:]
|
||||
|
||||
//+attribute("lex_id")
|
||||
// | lex_id
|
||||
|
||||
details
|
||||
summary: h4 Alignment and Output
|
||||
|
||||
+attribute("idx")
|
||||
p Start index of the token in the string
|
||||
|
||||
+method("__len__", "")
|
||||
p Length of the token's orth string, in unicode code-points.
|
||||
|
||||
+method("__unicode__", "")
|
||||
p Same as token.orth_
|
||||
|
||||
+method("__str__", "")
|
||||
p Varies between Python 2 and Python 3
|
||||
|
||||
+attribute("string")
|
||||
p
|
||||
| The form of the word as it appears in the string, <strong>including
|
||||
| trailing whitespace</strong>. This is useful when you need to use
|
||||
| linguistic features to add inline mark-up to the string.
|
||||
|
||||
+method("nbor, i=1")
|
||||
+params
|
||||
+param("i")
|
||||
p Offset relative to token
|
||||
|
||||
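            p A sketch of the intended semantics:

            pre.language-python
                code
                    | # assuming doc has at least four tokens
                    | assert doc[2].nbor() is doc[3]
                    | assert doc[2].nbor(-1) is doc[1]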
    details
        summary: h4 Distributional Features

        +attribute("repvec")
            p
                | A "word embedding" representation: a dense real-valued vector that supports
                | similarity queries between words. By default, spaCy currently loads
                | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
                | model.

        +attribute("cluster")
            p
                | The Brown cluster ID of the word. These are often useful features for
                | linear models. If you're using a non-linear model, particularly a
                | neural net or random forest, consider using the real-valued word
                | representation vector, in Token.repvec, instead.

        +attribute("prob")
            p
                | The unigram log-probability of the word, estimated from counts from a
                | large corpus, smoothed using Simple Good-Turing estimation.

    details
        summary: h4 Syntactic Tags

        +attribute("pos / pos_")
            p
                | A part-of-speech tag, from the Google Universal Tag Set, e.g.
                | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
                | the 17 tag values are provided in <code>spacy.parts_of_speech</code>.

        +attribute("tag / tag_")
            p
                | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
                | <code>DT</code>, etc. These tags are language/corpus specific, and
                | typically describe part-of-speech and some amount of morphological
                | information. For instance, in the Penn Treebank tag set, <code>VBZ</code>
                | is assigned to a present-tense singular verb.

        +attribute("dep / dep_")
            p
                | The type of syntactic dependency relation between the word and its
                | syntactic head.

    details
        summary: h4 Navigating the Parse Tree

        +attribute("head")
            p
                | The Token that is the immediate syntactic head of the word. If the
                | word is the root of the dependency tree, the same word is returned.

        +attribute("lefts")
            p
                | An iterator for the immediate leftward syntactic children of the
                | word.

        +attribute("rights")
            p
                | An iterator for the immediate rightward syntactic children of the
                | word.

        +attribute("n_lefts")
            p
                | The number of immediate syntactic children preceding the word in
                | the string.

        +attribute("n_rights")
            p
                | The number of immediate syntactic children following the word in
                | the string.

        +attribute("children")
            p
                | An iterator that yields from lefts, and then yields from rights.

        +attribute("subtree")
            p
                | An iterator for the part of the sentence syntactically governed by
                | the word, including the word itself.

        +attribute("left_edge")
            p The leftmost edge of the token's subtree

        +attribute("right_edge")
            p The rightmost edge of the token's subtree

    details
        summary: h4 Named Entities

        +attribute("ent_type")
            p If the token is part of an entity, its entity type.

        +attribute("ent_iob")
            p The IOB (inside, outside, begin) entity recognition tag for the token.

    details
        summary: h4 Lexeme Flags

        +method("check_flag", "flag_id")
            +params
                +param("flag_id")
                    | flag ID

        +attribute("is_oov")
        +attribute("is_alpha")
        +attribute("is_ascii")
        +attribute("is_digit")
        +attribute("is_lower")
        +attribute("is_title")
        +attribute("is_punct")
        +attribute("is_space")
        +attribute("like_url")
        +attribute("like_num")
        +attribute("like_email")

    //+attribute("conjuncts")
    //    | Conjuncts


+declare_class("Span")
    +init
        +method("__init__")
            Temp

    <code>span = doc[0:4]</code>

    +sequence
        +method("__getitem__")
            p Get item

        +method("__iter__")
            p Iter

        +method("__len__")
            p Len

    details
        summary: h4 Parse

        +attribute("root")
            p Syntactic head

        +attribute("lefts")
            p Tokens that are:
            ol
                li To the left of the span;
                li Syntactic children of words within the span

            p i.e.

            pre.language-python
                code
                    | lefts = [span.doc[i] for i in range(0, span.start)
                    |          if span.doc[i].head in span]

        +attribute("rights")
            p Tokens that are:
            ol
                li To the right of the span;
                li Syntactic children of words within the span
            p i.e.
            pre.language-python
                code
                    | rights = [span.doc[i] for i in range(span.end, len(span.doc))
                    |           if span.doc[i].head in span]

        +attribute("subtree")
            p String

    details
        summary: h4 String Views

        +attribute("string")
            p String

        +attribute("lemma / lemma_")
            p String

        +attribute("label / label_")
            p String


+declare_class("Lexeme")
    p
        | The Lexeme object represents a lexical type, stored in the vocabulary
        | – as opposed to a token, occurring in a document.
    p
        | Lexemes store various features, so that these features can be computed
        | once per type, rather than once per token. As job sizes grow, this
        | can amount to a substantial efficiency improvement.

    p
        | All Lexeme attributes are therefore context independent, as a single
        | lexeme is reused for all usages of that word. Lexemes are keyed by
        | the “orth” attribute.

    p
        | All Lexeme attributes are accessible directly on the Token object.

    +init
        +method("__init__")
            p Init

    details
        summary: h4 String Features

        +attribute("orth / orth_")
            p
                | The form of the word with no string normalization or processing,
                | as it appears in the string, without trailing whitespace.

        +attribute("lower / lower_")
            p Tmp

        +attribute("norm / norm_")
            p Tmp

        +attribute("shape / shape_")
            p Tmp

        +attribute("prefix / prefix_")
            p Tmp

        +attribute("suffix / suffix_")
            p TMP


+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
    +sequence
        +method("__len__")
            +returns
                p Number of words in the vocabulary.

        +method("__iter__")
            +returns
                p Lexeme

    +maptype
        +method("__getitem__", "key_int")
            +params
                +param("key")
                    p Integer ID

            +returns: p A Lexeme object

        +method("__getitem__", "key_str")
            +params
                +param("key_str", types.unicode)
                    p A string in the vocabulary

            +returns("Lexeme")

        +method("__setitem__", "orth_str", "props")
            +params
                +param("orth_str", types.unicode)
                    p The orth key

                +param("props", types.dict)
                    p A props dictionary

            +returns("None")
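        p
            | Both lookup styles return the same Lexeme (a sketch, assuming
            | the vocabulary is available as <code>nlp.vocab</code>):

        pre.language-python
            code
                | apple = nlp.vocab[u'apple']            # unicode key
                | assert nlp.vocab[apple.orth] is apple  # integer key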
    details
        summary: h4 Import/Export

        +method("dump", "loc")
            +params
                +param("loc", types.unicode)
                    p Path where the vocabulary should be saved

        +method("load_lexemes", "loc")
            +params
                +param("loc", types.unicode)
                    p Path to load the lexemes.bin file from

        +method("load_vectors", "loc")
            +params
                +param("loc", types.unicode)
                    p Path to load the vectors.bin from


+declare_class("StringStore")
    +init
        Tmp

    +sequence
        +method("__len__")
            +returns("int")
                p Number of strings in the string-store

        +method("__iter__")
            +returns
                p Lexeme

    +maptype
        +method("__getitem__", "key_int")
            +params
                +param("key_int")
                    p An integer key

            +returns(types.unicode)
                p The string that the integer key maps to

        +method("__getitem__", "key_unicode")
            +params
                +param("key_unicode")
                    p A key, as a unicode string

            +returns(types.int)
                p The integer ID of the string.

        +method("__getitem__", "key_utf8_bytes")
            +params
                +param("key_utf8_bytes", types.bytes)
                    p A key, as a UTF-8 encoded byte-string

            +returns(types.int)
                p The integer ID of the string.
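        p
            | The overloads above give a two-way mapping (a sketch, assuming
            | the string-store is available as <code>nlp.vocab.strings</code>):

        pre.language-python
            code
                | strings = nlp.vocab.strings
                | orth_id = strings[u'apple']          # unicode --> int
                | assert strings[orth_id] == u'apple'  # int --> unicode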
    details
        summary: h4 Import/Export

        +method("dump", "loc")
            +params
                +param("loc")
                    p File path to save the strings.txt to.

        +method("load", "loc")
            +params
                +param("loc")
                    p File path to load the strings.txt from.

@@ -1,95 +0,0 @@
mixin Teaser(title, url, date_long, date_short, author, lede)
  article.post
    header
      h2
        a(href=url)= title
      .subhead
        | by
        a(href='#', rel='author')= author
        | on
        time(datetime=date_short)= date_long
      p!= lede

    a.readmore(href='#') ►


doctype html
html(lang='en')
  head
    meta(charset='utf-8')
    title spaCy Blog
    meta(name='description', content='')
    meta(name='author', content='Matthew Honnibal')
    link(rel='stylesheet', href='css/style.css')
    //if lt IE 9
      script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')

  body#blog
    header(role='banner')
      h1.logo spaCy Blog
      .slogan Blog

    nav(role="navigation")
      ul
        li: a(href="home.html") Home
        li: a(href="docs.html") Docs
        li.active: a(href="blog.html") Blog
        li: a(href="license.html") License

    main#content(role='main')
      section.intro.profile
        p
          img(src='img/matt.png')
          | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore.
          span.social
            a(href='#') Follow me on Twitter
        nav(role='navigation')
          ul
            li
              a.button(href='#') Blog
            li
              a.button(href='#tutorials') Tutorials
      section.blogs
        +Teaser(
          "Introducing spaCy",
          "blog_intro.html",
          "February 2015",
          "2015-02-18",
          "Matthew Honnibal",
          "<strong>spaCy</strong> is a new library for text processing in Python " +
          "and Cython. I wrote it because I think small companies are terrible at " +
          "natural language processing (NLP). Or rather: small companies are using " +
          "terrible NLP technology."
        )

        +Teaser(
          "Parsing English in 500 lines of Python",
          "blog_parser.html",
          "December 18, 2013",
          "2013-12-18",
          "Matthew Honnibal",
          "The Natural Language Processing (NLP) community has made big progress " +
          "in syntactic parsing over the last few years. It’s now possible for a " +
          "tiny Python implementation to perform better than the widely-used Stanford " +
          "PCFG parser.")
        +Teaser(
          "A good Part-of-Speech tagger in about 200 lines of Python",
          "blog_tagger.html",
          "September 11, 2013",
          "2013-09-11",
          "Matthew Honnibal",
          "There are a tonne of “best known techniques” for POS tagging, and you " +
          "should ignore the others and just use greedy Averaged Perceptron."
        )

      section.intro
        h2
          a.permalink(href='#tutorials', name='tutorials') Tutorials

      section.tutorials
        include ./tutorials.jade

    footer(role="contentinfo")
      span.slogan.copyright © 2015 Syllogism Co.

    script(src='js/prism.js')

@@ -1,81 +0,0 @@
extends ./template_post.jade

-
  var urls = {
    'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/',
    'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html",
    'implementation': 'https://gist.github.com/syllog1sm/10343947',
    'redshift': 'http://github.com/syllog1sm/redshift',
    'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm',
    'acl_anthology': 'http://aclweb.org/anthology/',
    'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal'
  }

- var my_research_software = '<a href="https://github.com/syllog1sm/redshift/tree/develop">my research software</a>'

- var how_to_write_a_POS_tagger = '<a href="https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/">how to write a part-of-speech tagger</a>'

- var parser_lnk = '<a href="https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/">parser</a>'

- var buy_a_commercial_license = '<a href="license.html">buy a commercial license</a>'


block body_block
  article.post
    p.
      <strong>spaCy</strong> is a new library for text processing in Python
      and Cython. I wrote it because I think small companies are terrible at
      natural language processing (NLP). Or rather: small companies are using
      terrible NLP technology.

    p.
      To do great NLP, you have to know a little about linguistics, a lot
      about machine learning, and almost everything about the latest research.
      The people who fit this description seldom join small companies.
      Most are broke – they've just finished grad school.
      If they don't want to stay in academia, they join Google, IBM, etc.

    p.
      The net result is that outside of the tech giants, commercial NLP has
      changed little in the last ten years. In academia, it's changed entirely.
      Amazing improvements in quality. Orders of magnitude faster. But the
      academic code is always GPL, undocumented, unusable, or all three.
      You could implement the ideas yourself, but the papers are hard to read,
      and training data is exorbitantly expensive. So what are you left with?
      A common answer is NLTK, which was written primarily as an educational resource.
      Nothing past the tokenizer is suitable for production use.

    p.
      I used to think that the NLP community just needed to do more to communicate
      its findings to software engineers. So I wrote two blog posts, explaining
      !{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well
      received, and there's been a bit of interest in !{my_research_software}
      – even though it's entirely undocumented, and mostly unusable to
      anyone but me.
    p.
      So six months ago I quit my post-doc, and I've been working day and night
      on spaCy since. I'm now pleased to announce an alpha release.

    p.
      If you're a small company doing NLP, I think spaCy will seem like a minor
      miracle. It's by far the fastest NLP software ever released. The
      full processing pipeline completes in 20ms per document, including accurate
      tagging and parsing. All strings are mapped to integer IDs, tokens are
      linked to embedded word representations, and a range of useful features
      are pre-calculated and cached.

    p.
      If none of that made any sense to you, here's the gist of it. Computers
      don't understand text. This is unfortunate, because that's what the
      web almost entirely consists of. We want to recommend people text based
      on other text they liked. We want to shorten text to display it on a
      mobile screen. We want to aggregate it, link it, filter it, categorise
      it, generate it and correct it.

    p.
      spaCy provides a library of utility functions that help programmers
      build such products. It's commercial open source software: you can
      either use it under the AGPL, or you can !{buy_a_commercial_license}
      under generous terms.

  footer(role='contentinfo')

@@ -1,938 +0,0 @@
extends ./template_post.jade


block body_block
  - var urls = {}
  - urls.pos_post = 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/'
  - urls.google_ngrams = "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html"
  - urls.implementation = 'https://gist.github.com/syllog1sm/10343947'
  - urls.redshift = 'http://github.com/syllog1sm/redshift'
  - urls.tasker = 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm'
  - urls.acl_anthology = 'http://aclweb.org/anthology/'
  - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal"

  article.post
    header
      h2 Parsing English in 500 lines of Python
      .subhead
        | by
        a(href='#', rel='author') Matthew Honnibal
        | on
        time(datetime='2013-12-18') December 18, 2013
    p
      | A
      a(href=urls.google_ngrams) syntactic parser
      | describes a sentence’s grammatical structure, to help another
      | application reason about it. Natural languages introduce many unexpected
      | ambiguities, which our world-knowledge immediately filters out. A
      | favourite example:

    p.example They ate the pizza with anchovies

    p
      img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity')
    p
      | A correct parse links “with” to “pizza”, while an incorrect parse
      | links “with” to “eat”:

    .displacy
      iframe(src='displacy/anchovies_bad.html', height='275')

    .displacy
      iframe.displacy(src='displacy/anchovies_good.html', height='275')
      a.view-displacy(href='#') View on displaCy
      p.caption
        | The Natural Language Processing (NLP) community has made big progress
        | in syntactic parsing over the last few years.

    p
      | The Natural Language Processing (NLP) community has made big progress
      | in syntactic parsing over the last few years. It’s now possible for
      | a tiny Python implementation to perform better than the widely-used
      | Stanford PCFG parser.

    p
      strong Update!
      | The Stanford CoreNLP library now includes a greedy transition-based
      | dependency parser, similar to the one described in this post, but with
      | an improved learning strategy. It is much faster and more accurate
      | than this simple Python implementation.

    table
      thead
        tr
          th Parser
          th Accuracy
          th Speed (w/s)
          th Language
          th LOC
      tbody
        tr
          td Stanford
          td 89.6%
          td 19
          td Java
          td
            | > 4,000
            sup
              a(href='#note-1') [1]
        tr
          td
            strong parser.py
          td 89.8%
          td 2,020
          td Python
          td
            strong ~500
        tr
          td Redshift
          td
            strong 93.6%
          td
            strong 2,580
          td Cython
          td ~4,000
    p
      | The rest of the post sets up the problem, and then takes you through
      a(href=urls.implementation) a concise implementation
      | , prepared for this post. The first 200 lines of parser.py, the
      | part-of-speech tagger and learner, are described
      a(href=urls.pos_post) here
      | . You should probably at least skim that
      | post before reading this one, unless you’re very familiar with NLP
      | research.
    p
      | The Cython system, Redshift, was written for my current research. I
      | plan to improve it for general use in June, after my contract ends
      | at Macquarie University. The current version is
      a(href=urls.redshift) hosted on GitHub
      | .
    h3 Problem Description

    p It’d be nice to type an instruction like this into your phone:

    p.example
      | Set volume to zero when I’m in a meeting, unless John’s school calls.
    p
      | And have it set the appropriate policy. On Android you can do this
      | sort of thing with
      a(href=urls.tasker) Tasker
      | , but an NL interface would be much better. It’d be especially nice
      | to receive a meaning representation you could edit, so you could see
      | what it thinks you said, and correct it.
    p
      | There are lots of problems to solve to make that work, but some sort
      | of syntactic representation is definitely necessary. We need to know that:

    p.example
      | Unless John’s school calls, when I’m in a meeting, set volume to zero

    p is another way of phrasing the first instruction, while:

    p.example
      | Unless John’s school, call when I’m in a meeting

    p means something completely different.

    p
      | A dependency parser returns a graph of word-word relationships,
      | intended to make such reasoning easier. Our graphs will be trees –
      | edges will be directed, and every node (word) will have exactly one
      | incoming arc (one dependency, with its head), except one.

    h4 Example usage

    pre.language-python
      code
        | >>> parser = parser.Parser()
        | >>> tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split()
        | >>> tags, heads = parser.parse(tokens)
        | >>> heads
        | [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11]
        | >>> for i, h in enumerate(heads):
        | ...   head = tokens[h] if h >= 0 else 'None'
        | ...   print(tokens[i] + ' <-- ' + head)
        | Set <-- None
        | the <-- volume
        | volume <-- Set
        | to <-- Set
        | zero <-- to
        | when <-- Set
        | I <-- 'm
        | 'm <-- when
        | in <-- 'm
        | a <-- meeting
        | meeting <-- in
        | unless <-- Set
        | John <-- 's
        | 's <-- calls
        | school <-- calls
        | calls <-- unless

    p.
      The idea is that it should be slightly easier to reason from the parse,
      than it was from the string. The parse-to-meaning mapping is hopefully
      simpler than the string-to-meaning mapping.

    p.
      The most confusing thing about this problem area is that “correctness”
      is defined by convention — by annotation guidelines. If you haven’t
      read the guidelines and you’re not a linguist, you can’t tell whether
      the parse is “wrong” or “right”, which makes the whole task feel weird
      and artificial.

    p.
      For instance, there’s a mistake in the parse above: “John’s school
      calls” is structured wrongly, according to the Stanford annotation
      guidelines. The structure of that part of the sentence is how the
      annotators were instructed to parse an example like “John’s school
      clothes”.

    p
      | It’s worth dwelling on this point a bit. We could, in theory, have
      | written our guidelines so that the “correct” parses were reversed.
      | There’s good reason to believe the parsing task will be harder if we
      | reversed our convention, as it’d be less consistent with the rest of
      | the grammar.
      sup: a(href='#note-2') [2]
      | But we could test that empirically, and we’d be pleased to gain an
      | advantage by reversing the policy.

    p
      | We definitely do want that distinction in the guidelines — we don’t
      | want both to receive the same structure, or our output will be less
      | useful. The annotation guidelines strike a balance between what
      | distinctions downstream applications will find useful, and what
      | parsers will be able to predict easily.

    h4 Projective trees

    p
      | There’s a particularly useful simplification that we can make, when
      | deciding what we want the graph to look like: we can restrict the
      | graph structures we’ll be dealing with. This doesn’t just give us a
      | likely advantage in learnability; it can have deep algorithmic
      | implications. We follow most work on English in constraining the
      | dependency graphs to be
      em projective trees
      | :

    ol
      li Tree. Every word has exactly one head, except for the dummy ROOT symbol.
      li
        | Projective. Writing each arc as (a1, a2) with a1 < a2: for every
        | pair of arcs (a1, a2) and (b1, b2), if a1 < b1 < a2, then b2 <= a2.
        | In other words, dependencies cannot “cross”. You can’t have a pair
        | of dependencies that goes a1 b1 a2 b2, or b1 a1 b2 a2 (the sketch
        | below checks exactly this).
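    p
      | As a quick check of the definition, here is a brute-force
      | projectivity test (a sketch; it assumes a heads list in which -1
      | marks the root):

    pre.language-python
      code
        | def is_projective(heads):
        |     # heads[i] is the head index of word i; -1 marks the root.
        |     arcs = [(min(i, h), max(i, h))
        |             for i, h in enumerate(heads) if h >= 0]
        |     for a1, a2 in arcs:
        |         for b1, b2 in arcs:
        |             if a1 < b1 < a2 < b2:  # the crossing pattern a1 b1 a2 b2
        |                 return False
        |     return True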
    p
      | There’s a rich literature on parsing non-projective trees, and a
      | smaller literature on parsing DAGs. But the parsing algorithm I’ll
      | be explaining deals with projective trees.

    h3 Greedy transition-based parsing

    p
      | Our parser takes as input a list of string tokens, and outputs a
      | list of head indices, representing edges in the graph. If the
      em i
      | th member of heads is
      em j
      | , the dependency parse contains an edge (j, i). A transition-based
      | parser is a finite-state transducer; it maps an array of N words
      | onto an output array of N head indices:

    table.center
      tbody
        tr
          td
            em start
          td MSNBC
          td reported
          td that
          td Facebook
          td bought
          td WhatsApp
          td for
          td $16bn
          td
            em root
        tr
          td 0
          td 2
          td 9
          td 2
          td 4
          td 2
          td 4
          td 4
          td 7
          td 0
    p
      | The heads array denotes that the head of
      em MSNBC
      | is
      em reported
      | :
      em MSNBC
      | is word 1, and
      em reported
      | is word 2, and
      code.language-python heads[1] == 2
      | . You can already see why parsing a tree is handy — this data structure
      | wouldn’t work if we had to output a DAG, where words may have multiple
      | heads.

    p
      | Although
      code.language-python heads
      | can be represented as an array, we’d actually like to maintain some
      | alternate ways to access the parse, to make it easy and efficient to
      | extract features. Our
      code.language-python Parse
      | class looks like this:

    pre.language-python
      code
        | class Parse(object):
        |     def __init__(self, n):
        |         self.n = n
        |         self.heads = [None] * (n-1)
        |         self.lefts = []
        |         self.rights = []
        |         for i in range(n+1):
        |             self.lefts.append(DefaultList(0))
        |             self.rights.append(DefaultList(0))
        |
        |     def add_arc(self, head, child):
        |         self.heads[child] = head
        |         if child < head:
        |             self.lefts[head].append(child)
        |         else:
        |             self.rights[head].append(child)

    p
      | As well as the parse, we also have to keep track of where we’re up
      | to in the sentence. We’ll do this with an index into the
      code.language-python words
      | array, and a stack, to which we’ll push words, before popping them
      | once their head is set. So our state data structure is fundamentally:

    ul
      li An index, i, into the list of tokens;
      li The dependencies added so far, in Parse;
      li
        | A stack, containing words that occurred before i, for which we’re
        | yet to assign a head.

    p Each step of the parsing process applies one of three actions to the state:

    pre.language-python
      code
        | SHIFT = 0; RIGHT = 1; LEFT = 2
        | MOVES = [SHIFT, RIGHT, LEFT]
        |
        | def transition(move, i, stack, parse):
        |     global SHIFT, RIGHT, LEFT
        |     if move == SHIFT:
        |         stack.append(i)
        |         return i + 1
        |     elif move == RIGHT:
        |         parse.add_arc(stack[-2], stack.pop())
        |         return i
        |     elif move == LEFT:
        |         parse.add_arc(i, stack.pop())
        |         return i
        |     raise GrammarError("Unknown move: %d" % move)

    p
      | The
      code.language-python LEFT
      | and
      code.language-python RIGHT
      | actions add dependencies and pop the stack, while
      code.language-python SHIFT
      | pushes the stack and advances i into the buffer.
    p.
      So, the parser starts with an empty stack, and a buffer index at 0, with
      no dependencies recorded. It chooses one of the (valid) actions, and
      applies it to the state. It continues choosing actions and applying
      them until the stack is empty and the buffer index is at the end of
      the input. (It’s hard to understand this sort of algorithm without
      stepping through it. Try coming up with a sentence, drawing a projective
      parse tree over it, and then try to reach the parse tree by choosing
      the right sequence of transitions.)

    p Here’s what the parsing loop looks like in code:

    pre.language-python
      code
        | class Parser(object):
        |     ...
        |     def parse(self, words):
        |         tags = self.tagger(words)
        |         n = len(words)
        |         idx = 1
        |         stack = [0]
        |         deps = Parse(n)
        |         while stack or idx < n:
        |             features = extract_features(words, tags, idx, n, stack, deps)
        |             scores = self.model.score(features)
        |             valid_moves = get_valid_moves(idx, n, len(stack))
        |             next_move = max(valid_moves, key=lambda move: scores[move])
        |             idx = transition(next_move, idx, stack, deps)
        |         return tags, deps
        |
        | def get_valid_moves(i, n, stack_depth):
        |     moves = []
        |     if i < n:
        |         moves.append(SHIFT)
        |     if stack_depth >= 2:
        |         moves.append(RIGHT)
        |     if stack_depth >= 1:
        |         moves.append(LEFT)
        |     return moves

    p.
      We start by tagging the sentence, and initializing the state. We then
      map the state to a set of features, which we score using a linear model.
      We then find the best-scoring valid move, and apply it to the state.

    p
      | The model scoring works the same as it did in
      a(href=urls.pos_post) the POS tagger
      | . If you’re confused about the idea of extracting features and scoring
      | them with a linear model, you should review that post. Here’s a reminder
      | of how the model scoring works:

    pre.language-python
      code
        | class Perceptron(object):
        |     ...
        |     def score(self, features):
        |         all_weights = self.weights
        |         scores = dict((clas, 0) for clas in self.classes)
        |         for feat, value in features.items():
        |             if value == 0:
        |                 continue
        |             if feat not in all_weights:
        |                 continue
        |             weights = all_weights[feat]
        |             for clas, weight in weights.items():
        |                 scores[clas] += value * weight
        |         return scores

    p.
      It’s just summing the class-weights for each feature. This is often
      expressed as a dot-product, but when you’re dealing with multiple
      classes, that gets awkward, I find.
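    p.
      To make the connection concrete, here is the same computation on a toy
      example (a sketch; the weights and features are made up):

    pre.language-python
      code
        | weights = {'bias': {0: 0.1, 1: -0.3}, 't=NN': {0: 0.5, 1: 0.2}}
        | features = {'bias': 1, 't=NN': 1, 't=VB': 0}
        | # Summing class-weights per feature, as score() does above...
        | score_0 = sum(value * weights[feat].get(0, 0.0)
        |               for feat, value in features.items() if feat in weights)
        | # ...equals the dot product of the feature vector with class 0's
        | # weight column: 1*0.1 + 1*0.5 = 0.6
        | assert abs(score_0 - 0.6) < 1e-9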
    p.
      The beam parser (RedShift) tracks multiple candidates, and only decides
      on the best one at the very end. We’re going to trade away accuracy
      in favour of efficiency and simplicity. We’ll only follow a single
      analysis. Our search strategy will be entirely greedy, as it was with
      the POS tagger. We’ll lock-in our choices at every step.

    p.
      If you read the POS tagger post carefully, you might see the underlying
      similarity. What we’ve done is mapped the parsing problem onto a
      sequence-labelling problem, which we address using a “flat”, or unstructured,
      learning algorithm (by doing greedy search).

    h3 Features
    p.
      Feature extraction code is always pretty ugly. The features for the parser
      refer to a few tokens from the context:

    ul
      li The first three words of the buffer (n0, n1, n2)
      li The top three words of the stack (s0, s1, s2)
      li The two leftmost children of s0 (s0b1, s0b2)
      li The two rightmost children of s0 (s0f1, s0f2)
      li The two leftmost children of n0 (n0b1, n0b2)

    p.
      For these 12 tokens, we refer to the word-form, the part-of-speech tag,
      and the number of left and right children attached to the token.

    p.
      Because we’re using a linear model, we have our features refer to pairs
      and triples of these atomic properties.

    pre.language-python
      code
        | def extract_features(words, tags, n0, n, stack, parse):
        |     def get_stack_context(depth, stack, data):
        |         if depth >= 3:
        |             return data[stack[-1]], data[stack[-2]], data[stack[-3]]
        |         elif depth >= 2:
        |             return data[stack[-1]], data[stack[-2]], ''
        |         elif depth == 1:
        |             return data[stack[-1]], '', ''
        |         else:
        |             return '', '', ''
        |
        |     def get_buffer_context(i, n, data):
        |         if i + 1 >= n:
        |             return data[i], '', ''
        |         elif i + 2 >= n:
        |             return data[i], data[i + 1], ''
        |         else:
        |             return data[i], data[i + 1], data[i + 2]
        |
        |     def get_parse_context(word, deps, data):
        |         if word == -1:
        |             return 0, '', ''
        |         deps = deps[word]
        |         valency = len(deps)
        |         if not valency:
        |             return 0, '', ''
        |         elif valency == 1:
        |             return 1, data[deps[-1]], ''
        |         else:
        |             return valency, data[deps[-1]], data[deps[-2]]
        |
        |     features = {}
        |     # Set up the context pieces --- the word, W, and tag, T, of:
        |     # S0-2: Top three words on the stack
        |     # N0-2: First three words of the buffer
        |     # n0b1, n0b2: Two leftmost children of the first word of the buffer
        |     # s0b1, s0b2: Two leftmost children of the top word of the stack
        |     # s0f1, s0f2: Two rightmost children of the top word of the stack
        |
        |     depth = len(stack)
        |     s0 = stack[-1] if depth else -1
        |
        |     Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
        |     Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
        |
        |     Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
        |     Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
        |
        |     Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
        |     Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
        |
        |     Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
        |     _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
        |
        |     Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
        |     _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
        |
        |     Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
        |     _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
        |
        |     # Cap numeric features at 5
        |     # String-distance
        |     Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
        |
        |     features['bias'] = 1
        |     # Add word and tag unigrams
        |     for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
        |         if w:
        |             features['w=%s' % w] = 1
        |     for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
        |         if t:
        |             features['t=%s' % t] = 1
        |
        |     # Add word/tag pairs
        |     for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
        |         if w or t:
        |             features['%d w=%s, t=%s' % (i, w, t)] = 1
        |
        |     # Add some bigrams
        |     features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
        |     features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
        |     features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
        |     features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
        |     features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
        |     features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
        |     features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
        |     features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
        |
        |     # Add some tag trigrams
        |     trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
        |                 (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
        |                 (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
        |                 (Ts0, Ts1, Ts1))
        |     for i, (t1, t2, t3) in enumerate(trigrams):
        |         if t1 or t2 or t3:
        |             features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
        |
        |     # Add some valency and distance features
        |     vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
        |     vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
        |     d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
        |          ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
        |     for i, (w_t, v_d) in enumerate(vw + vt + d):
        |         if w_t or v_d:
        |             features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
        |     return features

    h3 Training

    p.
      Weights are learned using the same algorithm, averaged perceptron, that
      we used for part-of-speech tagging. Its key strength is that it’s an
      online learning algorithm: examples stream in one-by-one, we make our
      prediction, check the actual answer, and adjust our beliefs (weights)
      if we were wrong.

    p The training loop looks like this:

    pre.language-python
      code
        | class Parser(object):
        |     ...
        |     def train_one(self, itn, words, gold_tags, gold_heads):
        |         n = len(words)
        |         i = 2; stack = [1]; parse = Parse(n)
        |         tags = self.tagger.tag(words)
        |         while stack or (i + 1) < n:
        |             features = extract_features(words, tags, i, n, stack, parse)
        |             scores = self.model.score(features)
        |             valid_moves = get_valid_moves(i, n, len(stack))
        |             guess = max(valid_moves, key=lambda move: scores[move])
        |             gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
        |             best = max(gold_moves, key=lambda move: scores[move])
        |             self.model.update(best, guess, features)
        |             i = transition(guess, i, stack, parse)
        |         # Return number correct
        |         return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])
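    p.
      For reference, here is a minimal, non-averaged sketch of the update
      step used above; the real implementation also keeps the accumulators
      and timestamps needed for weight averaging:

    pre.language-python
      code
        | class Perceptron(object):
        |     ...
        |     def update(self, truth, guess, features):
        |         # A sketch of the non-averaged perceptron update.
        |         if truth == guess:
        |             return
        |         for feat in features:
        |             weights = self.weights.setdefault(feat, {})
        |             weights[truth] = weights.get(truth, 0) + 1
        |             weights[guess] = weights.get(guess, 0) - 1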
p
|
||||
| The most interesting part of the training process is in
|
||||
code.language-python get_gold_moves.
|
||||
| The performance of our parser is made possible by an advance by Goldberg
|
||||
| and Nivre (2012), who showed that we’d been doing this wrong for years.
|
||||
|
||||
p
|
||||
| In the POS-tagging post, I cautioned that during training you need to
|
||||
| make sure you pass in the last two
|
||||
em predicted
|
||||
| tags as features for the current tag, not the last two
|
||||
em gold
|
||||
| tags. At test time you’ll only have the predicted tags, so if you
|
||||
| base your features on the gold sequence during training, your training
|
||||
| contexts won’t resemble your test-time contexts, so you’ll learn the
|
||||
| wrong weights.
|
||||
|
||||
p.
|
||||
In parsing, the problem was that we didn’t know
|
||||
em how
|
||||
| to pass in the predicted sequence! Training worked by taking the
|
||||
| gold-standard tree, and finding a transition sequence that led to it.
|
||||
| i.e., you got back a sequence of moves, with the guarantee that if
|
||||
| you followed those moves, you’d get the gold-standard dependencies.
|
||||
|
||||
p
|
||||
| The problem is, we didn’t know how to define the “correct” move to
|
||||
| teach a parser to make if it was in any state that
|
||||
em wasn’t
|
||||
| along that gold-standard sequence. Once the parser had made a mistake,
|
||||
| we didn’t know how to train from that example.
|
||||
|
||||
p
|
||||
| That was a big problem, because it meant that once the parser started
|
||||
| making mistakes, it would end up in states unlike any in its training
|
||||
| data – leading to yet more mistakes. The problem was specific
|
||||
| to greedy parsers: once you use a beam, there’s a natural way to do
|
||||
| structured prediction.
|
||||
p
|
||||
| The solution seems obvious once you know it, like all the best breakthroughs.
|
||||
| What we do is define a function that asks “How many gold-standard
|
||||
| dependencies can be recovered from this state?”. If you can define
|
||||
| that function, then you can apply each move in turn, and ask, “How
|
||||
| many gold-standard dependencies can be recovered from
|
||||
em this
|
||||
| state?”. If the action you applied allows
|
||||
em fewer
|
||||
| gold-standard dependencies to be reached, then it is sub-optimal.
|
||||
|
||||
p That’s a lot to take in.
|
||||
|
||||
p
|
||||
| So we have this function
|
||||
code Oracle(state)
|
||||
| :
|
||||
pre
|
||||
code
|
||||
| Oracle(state) = | gold_arcs ∩ reachable_arcs(state) |
|
||||
p
|
||||
| We also have a set of actions, each of which returns a new state.
|
||||
| We want to know:
|
||||
|
||||
ul
|
||||
li shift_cost = Oracle(state) – Oracle(shift(state))
|
||||
li right_cost = Oracle(state) – Oracle(right(state))
|
||||
li left_cost = Oracle(state) – Oracle(left(state))
|
||||
|
||||
p
|
||||
| Now, at least one of those costs
|
||||
em has
|
||||
| to be zero. Oracle(state) is asking, “what’s the cost of the best
|
||||
| path forward?”, and the first action of that best path has to be
|
||||
| shift, right, or left.
|
||||
|
||||
p
|
||||
| It turns out that we can derive Oracle fairly simply for many transition
|
||||
| systems. The derivation for the transition system we’re using, Arc
|
||||
| Hybrid, is in Goldberg and Nivre (2013).
|
||||
|
||||
p
|
||||
| We’re going to implement the oracle as a function that returns the
|
||||
| zero-cost moves, rather than implementing a function Oracle(state).
|
||||
| This prevents us from doing a bunch of costly copy operations.
|
||||
| Hopefully the reasoning in the code isn’t too hard to follow, but
|
||||
| you can also consult Goldberg and Nivre’s papers if you’re confused
|
||||
| and want to get to the bottom of this.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def get_gold_moves(n0, n, stack, heads, gold):
|
||||
| def deps_between(target, others, gold):
|
||||
| for word in others:
|
||||
| if gold[word] == target or gold[target] == word:
|
||||
| return True
|
||||
| return False
|
||||
|
|
||||
| valid = get_valid_moves(n0, n, len(stack))
|
||||
| if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
|
||||
| return [SHIFT]
|
||||
| if gold[stack[-1]] == n0:
|
||||
| return [LEFT]
|
||||
| costly = set([m for m in MOVES if m not in valid])
|
||||
| # If the word behind s0 is its gold head, Left is incorrect
|
||||
| if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
|
||||
| costly.add(LEFT)
|
||||
| # If there are any dependencies between n0 and the stack,
|
||||
| # pushing n0 will lose them.
|
||||
| if SHIFT not in costly and deps_between(n0, stack, gold):
|
||||
| costly.add(SHIFT)
|
||||
| # If there are any dependencies between s0 and the buffer, popping
|
||||
| # s0 will lose them.
|
||||
| if deps_between(stack[-1], range(n0+1, n-1), gold):
|
||||
| costly.add(LEFT)
|
||||
| costly.add(RIGHT)
|
||||
| return [m for m in MOVES if m not in costly]</code></pre>
|
||||
|
||||
|
||||
|
||||
p
|
||||
| Doing this “dynamic oracle” training procedure makes a big difference
|
||||
| to accuracy — typically 1-2%, with no difference to the way the run-time
|
||||
| works. The old “static oracle” greedy training procedure is fully
|
||||
| obsolete; there’s no reason to do it that way any more.
|
||||
|
||||
h3 Conclusion
|
||||
|
||||
p
|
||||
| I have the sense that language technologies, particularly those relating
|
||||
| to grammar, are particularly mysterious. I can imagine having no idea
|
||||
| what the program might even do.
|
||||
|
||||
p
|
||||
| I think it therefore seems natural to people that the best solutions
|
||||
| would be over-whelmingly complicated. A 200,000 line Java package
|
||||
| feels appropriate.
|
||||
p
|
||||
| But, algorithmic code is usually short, when only a single algorithm
|
||||
| is implemented. And when you only implement one algorithm, and you
|
||||
| know exactly what you want to write before you write a line, you
|
||||
| also don’t pay for any unnecessary abstractions, which can have a
|
||||
| big performance impact.
|
||||
|
||||
h3 Notes
|
||||
p
|
||||
a(name='note-1')
|
||||
| [1] I wasn’t really sure how to count the lines of code in the Stanford
|
||||
| parser. Its jar file ships over 200k, but there are a lot of different
|
||||
| models in it. It’s not important, but it's certainly over 4k.
|
||||
|
||||
p
|
||||
a(name='note-2')
|
||||
| [2] For instance, how would you parse, “John’s school of music calls”?
|
||||
| You want to make sure the phrase “John’s school” has a consistent
|
||||
| structure in both “John’s school calls” and “John’s school of music
|
||||
| calls”. Reasoning about the different “slots” you can put a phrase
|
||||
| into is a key way we reason about what syntactic analyses look like.
|
||||
| You can think of each phrase as having a different shaped connector,
|
||||
| which you need to plug into different slots — which each phrase also
|
||||
| has a certain number of, each of a different shape. We’re trying to
|
||||
| figure out what connectors are where, so we can figure out how the
|
||||
| sentences are put together.
|
||||
|
||||
h3 Idle speculation
|
||||
p
|
||||
| For a long time, incremental language processing algorithms were
|
||||
| primarily of scientific interest. If you want to write a parser to
|
||||
| test a theory about how the human sentence processor might work, well,
|
||||
| that parser needs to build partial interpretations. There’s a wealth
|
||||
| of evidence, including commonsense introspection, that establishes
|
||||
| that we don’t buffer input and analyse it once the speaker has finished.
|
||||
|
||||
p
|
||||
| But now algorithms with that neat scientific feature are winning!
|
||||
| As best as I can tell, the secret to that success is to be:
|
||||
|
||||
ul
|
||||
li Incremental. Earlier words constrain the search.
|
||||
li
|
||||
| Error-driven. Training involves a working hypothesis, which is
|
||||
| updated as it makes mistakes.
|
||||
|
||||
p
|
||||
| The links to human sentence processing seem tantalising. I look
|
||||
| forward to seeing whether these engineering breakthroughs lead to
|
||||
| any psycholinguistic advances.
|
||||
|
||||
h3 Bibliography
|
||||
|
||||
p
|
||||
| The NLP literature is almost entirely open access. All of the relevant
|
||||
| papers can be found
|
||||
a(href=urls.acl_anthology, rel='nofollow') here
|
||||
| .
|
||||
p
|
||||
| The parser I’ve described is an implementation of the dynamic-oracle
|
||||
| Arc-Hybrid system here:
|
||||
|
||||
span.bib-item
|
||||
| Goldberg, Yoav; Nivre, Joakim.
|
||||
em Training Deterministic Parsers with Non-Deterministic Oracles
|
||||
| . TACL 2013
|
||||
p
|
||||
| However, I wrote my own features for it. The arc-hybrid system was
|
||||
| originally described here:
|
||||
|
||||
span.bib-item
|
||||
| Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic
|
||||
| programming algorithms for transition-based dependency parsers. ACL 2011
|
||||
|
||||
p
|
||||
| The dynamic oracle training method was first described here:
|
||||
span.bib-item
|
||||
| A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav;
|
||||
| Nivre, Joakim. COLING 2012
|
||||
|
||||
p
|
||||
| This work depended on a big break-through in accuracy for transition-based
|
||||
| parsers, when beam-search was properly explored by Zhang and Clark.
|
||||
| They have several papers, but the preferred citation is:
|
||||
|
||||
span.bib-item
|
||||
| Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized
|
||||
| Perceptron and Beam Search. Computational Linguistics 2011 (1)
|
||||
p
|
||||
| Another important paper was this little feature engineering paper,
|
||||
| which further improved the accuracy:
|
||||
|
||||
span.bib-item
|
||||
| Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with
|
||||
| Rich Non-local Features. ACL 2011
|
||||
|
||||
p
|
||||
| The generalised perceptron, which is the learning framework for these
|
||||
| beam parsers, is from this paper:
|
||||
span.bib-item
|
||||
| Collins, Michael. Discriminative Training Methods for Hidden Markov
|
||||
| Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002
|
||||
|
||||
h3 Experimental details
|
||||
p
|
||||
| The results at the start of the post refer to Section 22 of the Wall
|
||||
| Street Journal corpus. The Stanford parser was run as follows:
|
||||
|
||||
pre.language-bash
|
||||
code
|
||||
| java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \
|
||||
| -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $*
|
||||
|
||||
|
||||
|
||||
p
|
||||
| A small post-process was applied, to undo the fancy tokenisation
|
||||
| Stanford adds for numbers, to make them match the PTB tokenisation:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| """Stanford parser retokenises numbers. Split them."""
|
||||
| import sys
|
||||
| import re
|
||||
|
|
||||
| qp_re = re.compile('\xc2\xa0')
|
||||
| for line in sys.stdin:
|
||||
| line = line.rstrip()
|
||||
| if qp_re.search(line):
|
||||
| line = line.replace('(CD', '(QP (CD', 1) + ')'
|
||||
| line = line.replace('\xc2\xa0', ') (CD ')
|
||||
| print line
|
||||
|
||||
p
|
||||
| The resulting PTB-format files were then converted into dependencies
|
||||
| using the Stanford converter:
|
||||
|
||||
pre.language-bash
|
||||
code
|
||||
| for f in $1/*.mrg; do
|     echo $f
|     grep -v CODE $f > "$f.2"
|     out="$f.dep"
|     java -mx800m -cp "$scriptdir/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \
|      -treeFile "$f.2" -basic -makeCopulaHead -conllx > $out
| done
|
||||
p
|
||||
| I can’t easily read that anymore, but it should just convert every
|
||||
| .mrg file in a folder to a CoNLL-format Stanford basic dependencies
|
||||
| file, using the settings common in the dependency literature.
|
||||
|
||||
p
|
||||
| I then converted the gold-standard trees from WSJ 22, for the evaluation.
|
||||
| Accuracy scores refer to unlabelled attachment score (i.e. the head index)
|
||||
| of all non-punctuation tokens.
|
||||
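p
| In code, that metric is simple to compute. This sketch is mine, not the
| evaluate script’s source, but it implements the definition above:
pre.language-python
code
| def unlabelled_attachment_score(gold_heads, guess_heads, is_punct):
|     # Fraction of non-punctuation tokens whose predicted head index
|     # matches the gold-standard head index
|     correct = 0; total = 0
|     for gold, guess, punct in zip(gold_heads, guess_heads, is_punct):
|         if punct:
|             continue
|         correct += gold == guess
|         total += 1
|     return float(correct) / total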
|
||||
p
|
||||
| To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21
|
||||
| into the same conversion script.
|
||||
|
||||
p
|
||||
| In a nutshell: The Stanford model and parser.py are trained on the
|
||||
| same set of sentences, and they each make their predictions on a
|
||||
| held-out test set, for which we know the answers. Accuracy refers
|
||||
| to how many of the words’ heads we got correct.
|
||||
|
||||
p
|
||||
| Speeds were measured on a 2.4GHz Xeon. I ran the experiments on a
|
||||
| server, to give the Stanford parser more memory. The parser.py system
|
||||
| runs fine on my MacBook Air. I used PyPy for the parser.py experiments;
|
||||
| CPython was about half as fast on an early benchmark.
|
||||
|
||||
p
|
||||
| One of the reasons parser.py is so fast is that it does unlabelled
|
||||
| parsing. Based on previous experiments, a labelled parser would likely
|
||||
| be about 40x slower, and about 1% more accurate. Adapting the program
|
||||
| to labelled parsing would be a good exercise for the reader, if you
|
||||
| have access to the data.
|
||||
|
||||
p
|
||||
| The result from the Redshift parser was produced from commit
|
||||
code b6b624c9900f3bf
|
||||
| , which was run as follows:
|
||||
pre.language-bash
|
||||
code
|
||||
| ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp
|
||||
| ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/
|
||||
| ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll
|
||||
|
||||
footer.meta(role='contentinfo')
|
||||
a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter
|
||||
.discuss
|
||||
a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News
|
||||
|
|
||||
a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit
|
|
@@ -1,492 +0,0 @@
|
|||
extends ./template_post.jade
|
||||
|
||||
block body_block
|
||||
- var urls = {}
|
||||
- urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal"
|
||||
|
||||
|
||||
article.post
|
||||
header
|
||||
h2 A good Part-of-Speech tagger in about 200 lines of Python
|
||||
.subhead
|
||||
| by
|
||||
a(href="#" rel="author") Matthew Honnibal
|
||||
| on
|
||||
time(datetime='2013-09-11') September 11, 2013
|
||||
|
||||
p.
|
||||
Up-to-date knowledge about natural language processing is mostly locked away
|
||||
in academia. And academics are mostly pretty self-conscious when we write.
|
||||
We’re careful. We don’t want to stick our necks out too much. But under-confident
|
||||
recommendations suck, so here’s how to write a good part-of-speech tagger.
|
||||
|
||||
p.
|
||||
There are a tonne of “best known techniques” for POS tagging, and you should
|
||||
ignore the others and just use Averaged Perceptron.
|
||||
|
||||
p.
|
||||
You should use two tags of history, and features derived from the Brown word
|
||||
clusters distributed here.
|
||||
|
||||
p.
|
||||
If you only need the tagger to work on carefully edited text, you should
|
||||
use case-sensitive features, but if you want a more robust tagger you
|
||||
should avoid them because they’ll make you over-fit to the conventions
|
||||
of your training domain. Instead, features that ask “how frequently is
|
||||
this word title-cased, in a large sample from the web?” work well. Then
|
||||
you can lower-case your comparatively tiny training corpus.
|
||||
|
||||
p.
|
||||
For efficiency, you should figure out which frequent words in your training
|
||||
data have unambiguous tags, so you don’t have to do anything but output
|
||||
their tags when they come up. About 50% of the words can be tagged that way.
|
||||
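p.
For illustration, building that table of unambiguous words might look
like the sketch below. The frequency and ambiguity thresholds are my
guesses at reasonable values, not a tested recipe:
pre.language-python
code
| from collections import defaultdict
|
| def make_tagdict(sentences, freq_thresh=20, ambiguity_thresh=0.97):
|     '''Map frequent words that almost always get one tag straight to it.'''
|     counts = defaultdict(lambda: defaultdict(int))
|     for words, tags in sentences:
|         for word, tag in zip(words, tags):
|             counts[word][tag] += 1
|     tagdict = {}
|     for word, tag_freqs in counts.items():
|         tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
|         n = sum(tag_freqs.values())
|         # Keep only words that are frequent and nearly unambiguous
|         if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
|             tagdict[word] = tag
|     return tagdict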
|
||||
p.
|
||||
And unless you really, really can’t do without an extra 0.1% of accuracy,
|
||||
you probably shouldn’t bother with any kind of search strategy; you should
just use a greedy model.
|
||||
|
||||
p.
|
||||
If you do all that, you’ll find your tagger easy to write and understand,
|
||||
and an efficient Cython implementation will perform as follows on the standard
|
||||
evaluation, 130,000 words of text from the Wall Street Journal:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th Accuracy
|
||||
th Time (130k words)
|
||||
tbody
|
||||
tr
|
||||
td CyGreedyAP
|
||||
td 97.1%
|
||||
td 4s
|
||||
|
||||
p.
|
||||
The 4s includes initialisation time — the actual per-token speed is high
|
||||
enough to be irrelevant; it won’t be your bottleneck.
|
||||
|
||||
p.
|
||||
It’s tempting to look at 97% accuracy and say that accuracy, too, is high
enough to be irrelevant, but that’s
|
||||
POS tags, and the taggers all perform much worse on out-of-domain data.
|
||||
Unfortunately accuracies have been fairly flat for the last ten years.
|
||||
That’s why my recommendation is to just use a simple and fast tagger that’s
|
||||
roughly as good.
|
||||
|
||||
p.
|
||||
The thing is though, it’s very common to see people using taggers that
|
||||
aren’t anywhere near that good! For an example of what a non-expert is
|
||||
likely to use, these were the two taggers wrapped by TextBlob, a new Python
|
||||
api that I think is quite neat:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th Accuracy
|
||||
th Time (130k words)
|
||||
tbody
|
||||
tr
|
||||
td NLTK
|
||||
td 94.0%
|
||||
td 3m56s
|
||||
tr
|
||||
td Pattern
|
||||
td 93.5%
|
||||
td 26s
|
||||
|
||||
p.
|
||||
Both Pattern and NLTK are very robust and beautifully well documented, so
|
||||
the appeal of using them is obvious. But Pattern’s algorithms are pretty
|
||||
crappy, and NLTK carries tremendous baggage around in its implementation
|
||||
because of its massive framework, and double-duty as a teaching tool.
|
||||
|
||||
p.
|
||||
As a stand-alone tagger, my Cython implementation is needlessly complicated
|
||||
– it was written for my parser. So today I wrote a 200 line version
|
||||
of my recommended algorithm for TextBlob. It gets:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th Accuracy
|
||||
th Time (130k words)
|
||||
tbody
|
||||
tr
|
||||
td PyGreedyAP
|
||||
td 96.8%
|
||||
td 12s
|
||||
|
||||
p.
|
||||
I traded some accuracy and a lot of efficiency to keep the implementation
|
||||
simple. Here’s a far-too-brief description of how it works.
|
||||
|
||||
h3 Averaged perceptron
|
||||
|
||||
p.
|
||||
POS tagging is a “supervised learning problem”. You’re given a table of data,
|
||||
and you’re told that the values in the last column will be missing during
|
||||
run-time. You have to find correlations from the other columns to predict
|
||||
that value.
|
||||
|
||||
p.
|
||||
So for us, the missing column will be “part of speech at word i”. The predictor
columns (features) will be things like “part of speech at word i-1”, “last three
letters of word at i+1”, etc.
|
||||
|
||||
p.
|
||||
First, here’s what prediction looks like at run-time:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def predict(self, features):
|     '''Dot-product the features and current weights and return the best class.'''
|     scores = defaultdict(float)
|     for feat in features:
|         if feat not in self.weights:
|             continue
|         weights = self.weights[feat]
|         for clas, weight in weights.items():
|             scores[clas] += weight
|     # Do a secondary alphabetic sort, for stability
|     return max(self.classes, key=lambda clas: (scores[clas], clas))
|
||||
|
||||
p.
|
||||
Earlier I described the learning problem as a table, with one of the columns
|
||||
marked as missing-at-runtime. For NLP, our tables are always exceedingly
|
||||
sparse. You have columns like “word i-1=Parliament”, which is almost always
|
||||
0. So our “weight vectors” can pretty much never be implemented as vectors.
|
||||
Map-types are good though — here we use dictionaries.
|
||||
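p.
To make that concrete, here’s a tiny hand-made illustration of the layout
(the feature names are invented for the example):
pre.language-python
code
| # feature -> class -> weight; anything absent is implicitly zero
| weights = {
|     'word i-1=Parliament': {'NNP': 2.0, 'VBD': -1.0},
|     'i suffix=ing': {'VBG': 3.5, 'NN': -0.5},
| }
| # the active "columns" for one token, as a set of present features
| features = set(['word i-1=Parliament', 'i suffix=ing'])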
|
||||
p.
|
||||
The input data, features, is a set with a member for every non-zero “column”
|
||||
in our “table” – every active feature. Usually this is actually a dictionary,
|
||||
to let you set values for the features. But here all my features are binary
|
||||
present-or-absent type deals.
|
||||
|
||||
p.
|
||||
The weights data-structure is a dictionary of dictionaries that ultimately
|
||||
associates feature/class pairs with some weight. You want to structure it
|
||||
this way instead of the reverse because of the way word frequencies are
|
||||
distributed: most words are rare, frequent words are very frequent.
|
||||
|
||||
h3 Learning the weights
|
||||
|
||||
p.
|
||||
Okay, so how do we get the values for the weights? We start with an empty
|
||||
weights dictionary, and iteratively do the following:
|
||||
|
||||
ol
|
||||
li Receive a new (features, POS-tag) pair
|
||||
li Guess the value of the POS tag given the current “weights” for the features
|
||||
li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class.
|
||||
|
||||
|
||||
p.
|
||||
It’s one of the simplest learning algorithms. Whenever you make a mistake,
|
||||
increment the weights for the correct class, and penalise the weights that
|
||||
led to your false prediction. In code:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def train(self, nr_iter, examples):
|     # Assumes self.weights is a mapping that supports
|     # self.weights[feat][clas] += 1, e.g. a defaultdict of
|     # defaultdicts of floats
|     for i in range(nr_iter):
|         for features, true_tag in examples:
|             guess = self.predict(features)
|             if guess != true_tag:
|                 for f in features:
|                     self.weights[f][true_tag] += 1
|                     self.weights[f][guess] -= 1
|         random.shuffle(examples)
|
||||
p.
|
||||
If you iterate over the same example this way, the weights for the correct
|
||||
class would have to come out ahead, and you’d get the example right. If
|
||||
you think about what happens with two examples, you should be able to
|
||||
see that it will get them both right unless the features are identical.
|
||||
In general the algorithm will converge so long as the examples are
|
||||
linearly separable, although that doesn’t matter for our purpose.
|
||||
|
||||
h3 Averaging the weights
|
||||
|
||||
p.
|
||||
We need to do one more thing to make the perceptron algorithm competitive.
|
||||
The problem with the algorithm so far is that if you train it twice on
|
||||
slightly different sets of examples, you end up with really different models.
|
||||
It doesn’t generalise that smartly. And the problem is really in the later
|
||||
iterations — if you let it run to convergence, it’ll pay lots of attention
|
||||
to the few examples it’s getting wrong, and mutate its whole model around
|
||||
them.
|
||||
|
||||
p.
|
||||
So, what we’re going to do is make the weights more "sticky" – give
|
||||
the model less chance to ruin all its hard work in the later rounds. And
|
||||
we’re going to do that by returning the averaged weights, not the final
|
||||
weights.
|
||||
|
||||
p.
|
||||
I doubt there are many people who are convinced that’s the most obvious
|
||||
solution to the problem, but whatever. We’re not here to innovate, and this
|
||||
way is time tested on lots of problems. If you have another idea, run the
|
||||
experiments and tell us what you find. Actually I’d love to see more work
|
||||
on this, now that the averaged perceptron has become such a prominent learning
|
||||
algorithm in NLP.
|
||||
|
||||
p.
|
||||
Okay. So this averaging. How’s that going to work? Note that we don’t want
|
||||
to just average after each outer-loop iteration. We want the average of all
|
||||
the values — from the inner loop. So if we have 5,000 examples, and we train
|
||||
for 10 iterations, we’ll average across 50,000 values for each weight.
|
||||
|
||||
p.
|
||||
Obviously we’re not going to store all those intermediate values. Instead,
|
||||
we’ll track an accumulator for each weight, and divide it by the number of
|
||||
iterations at the end. Again: we want the average weight assigned to a
|
||||
feature/class pair during learning, so the key component we need is the total
|
||||
weight it was assigned. But we also want to be careful about how we compute
|
||||
that accumulator, too. On almost any instance, we’re going to see a tiny
|
||||
fraction of active feature/class pairs. All the other feature/class weights
|
||||
won’t change. So we shouldn’t have to go back and add the unchanged value
|
||||
to our accumulators anyway, like chumps.
|
||||
|
||||
p.
|
||||
Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain
|
||||
another dictionary that tracks how long each weight has gone unchanged. Now
|
||||
when we do change a weight, we can do a fast-forwarded update to the accumulator,
|
||||
for all those iterations where it lay unchanged.
|
||||
|
||||
p.
|
||||
Here’s what a weight update looks like now that we have to maintain the
|
||||
totals and the time-stamps:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def update(self, truth, guess, features):
|     def upd_feat(c, f, v):
|         # Fast-forward the accumulator over the iterations where this
|         # weight lay unchanged, then apply the new change
|         nr_iters_at_this_weight = self.i - self._timestamps[f][c]
|         self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c]
|         self.weights[f][c] += v
|         self._timestamps[f][c] = self.i
|
|     self.i += 1
|     for f in features:
|         upd_feat(truth, f, 1.0)
|         upd_feat(guess, f, -1.0)
|
||||
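p.
The post doesn’t show the final averaging step itself, so here’s a sketch
of how it can be done with the _totals and _timestamps dictionaries
maintained above. It’s close to the textblob-aptagger implementation, but
treat the details as illustrative:
pre.language-python
code
| def average_weights(self):
|     for feat, weights in self.weights.items():
|         new_feat_weights = {}
|         for clas, weight in weights.items():
|             # Fast-forward the accumulator over the iterations where
|             # this weight lay unchanged, then take the mean
|             total = self._totals[feat][clas]
|             total += (self.i - self._timestamps[feat][clas]) * weight
|             averaged = round(total / float(self.i), 3)
|             if averaged:
|                 new_feat_weights[clas] = averaged
|         self.weights[feat] = new_feat_weights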
|
||||
h3 Features and pre-processing
|
||||
|
||||
p.
|
||||
The POS tagging literature has tonnes of intricate features sensitive to
|
||||
case, punctuation, etc. They help on the standard test-set, which is from
|
||||
Wall Street Journal articles from the 1980s, but I don’t see how they’ll
|
||||
help us learn models that are useful on other text.
|
||||
|
||||
p.
|
||||
To help us learn a more general model, we’ll pre-process the data prior
|
||||
to feature extraction, as follows:
|
||||
|
||||
ul
|
||||
li All words are lower cased;
|
||||
li Digits in the range 1800-2100 are represented as !YEAR;
|
||||
li Other digit strings are represented as !DIGITS
|
||||
li
|
||||
| It would be better to have a module recognising dates, phone numbers,
|
||||
| emails, hash-tags, etc. but that will have to be pushed back into the
|
||||
| tokenization.
|
||||
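p.
In code, those rules might look like the following stand-alone helper (a
hypothetical version; the tagger’s own normalization may differ in
details):
pre.language-python
code
| def normalize(word):
|     # Rules from the list above: years, other digit strings, lower-case
|     if word.isdigit() and 1800 <= int(word) <= 2100:
|         return '!YEAR'
|     elif word.isdigit():
|         return '!DIGITS'
|     else:
|         return word.lower()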
|
||||
p.
|
||||
I played around with the features a little, and this seems to be a reasonable
|
||||
bang-for-buck configuration in terms of getting the development-data accuracy
|
||||
to 97% (where it typically converges anyway), and having a smaller memory
|
||||
foot-print:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def _get_features(self, i, word, context, prev, prev2):
|     '''Map tokens-in-contexts into a feature representation, implemented as a
|     set. If the features change, a new model must be trained.'''
|     def add(name, *args):
|         features.add('+'.join((name,) + tuple(args)))
|
|     features = set()
|     add('bias') # This acts sort of like a prior
|     add('i suffix', word[-3:])
|     add('i pref1', word[0])
|     add('i-1 tag', prev)
|     add('i-2 tag', prev2)
|     add('i tag+i-2 tag', prev, prev2)
|     add('i word', context[i])
|     add('i-1 tag+i word', prev, context[i])
|     add('i-1 word', context[i-1])
|     add('i-1 suffix', context[i-1][-3:])
|     add('i-2 word', context[i-2])
|     add('i+1 word', context[i+1])
|     add('i+1 suffix', context[i+1][-3:])
|     add('i+2 word', context[i+2])
|     return features
|
||||
|
||||
p.
|
||||
I haven’t added any features from external data, such as case frequency
|
||||
statistics from the Google Web 1T corpus. I might add those later, but for
|
||||
now I figured I’d keep things simple.
|
||||
|
||||
h3 What about search?
|
||||
|
||||
p.
|
||||
The model I’ve recommended commits to its predictions on each word, and
|
||||
moves on to the next one. Those predictions are then used as features for
|
||||
the next word. There’s a potential problem here, but it turns out it doesn’t
|
||||
matter much. It’s easy to fix with beam-search, but I say it’s not really
|
||||
worth bothering. And it definitely doesn’t matter enough to adopt a slow
|
||||
and complicated algorithm like Conditional Random Fields.
|
||||
|
||||
p.
|
||||
Here’s the problem. The best indicator for the tag at position, say, 3 in
|
||||
a sentence is the word at position 3. But the next-best indicators are the
|
||||
tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want
|
||||
the predictions for the surrounding words in hand before we commit to a
|
||||
prediction for the current word. Here’s an example where search might matter:
|
||||
|
||||
p.example.
|
||||
Their management plan reforms worked
|
||||
|
||||
p.
|
||||
Depending on just what you’ve learned from your training data, you can
|
||||
imagine making a different decision if you started at the left and moved
|
||||
right, conditioning on your previous decisions, than if you’d started at
|
||||
the right and moved left.
|
||||
|
||||
p.
|
||||
If that’s not obvious to you, think about it this way: “worked” is almost
|
||||
surely a verb, so if you tag “reforms” with that in hand, you’ll have a
|
||||
different idea of its tag than if you’d just come from “plan“, which you
|
||||
might have regarded as either a noun or a verb.
|
||||
|
||||
p.
|
||||
Search can only help you when you make a mistake. It can prevent that error
|
||||
from throwing off your subsequent decisions, or sometimes your future choices
|
||||
will correct the mistake. And that’s why for POS tagging, search hardly matters!
|
||||
Your model is so good straight-up that your past predictions are almost always
|
||||
true. So you really need the planets to align for search to matter at all.
|
||||
|
||||
p.
|
||||
And as we improve our taggers, search will matter less and less. Instead
|
||||
of search, what we should be caring about is multi-tagging. If we let the
|
||||
model be a bit uncertain, we can get over 99% accuracy assigning an average
|
||||
of 1.05 tags per word (Vadas et al., ACL 2006). The averaged perceptron is
|
||||
rubbish at multi-tagging though. That’s its big weakness. You really want
|
||||
a probability distribution for that.
|
||||
|
||||
p.
|
||||
One caveat when doing greedy search, though. It’s very important that your
|
||||
training data model the fact that the history will be imperfect at run-time.
|
||||
Otherwise, it will be way over-reliant on the tag-history features. Because
|
||||
the Perceptron is iterative, this is very easy.
|
||||
|
||||
p.
|
||||
Here’s the training loop for the tagger:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def train(self, sentences, save_loc=None, nr_iter=5, quiet=False):
|     '''Train a model from sentences, and save it at save_loc. nr_iter
|     controls the number of Perceptron training iterations.'''
|     self._make_tagdict(sentences, quiet=quiet)
|     self.model.classes = self.classes
|     for iter_ in range(nr_iter):
|         c = 0; n = 0
|         for words, tags in sentences:
|             # Reset the tag history at the start of each sentence
|             prev, prev2 = START
|             context = START + [self._normalize(w) for w in words] + END
|             for i, word in enumerate(words):
|                 guess = self.tagdict.get(word)
|                 if not guess:
|                     feats = self._get_features(
|                         i, word, context, prev, prev2)
|                     guess = self.model.predict(feats)
|                     self.model.update(tags[i], guess, feats)
|                 # Set the history features from the guesses, not the
|                 # true tags
|                 prev2 = prev; prev = guess
|                 c += guess == tags[i]; n += 1
|         random.shuffle(sentences)
|         if not quiet:
|             print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n)))
|     self.model.average_weights()
|     # Pickle as a binary file
|     if save_loc is not None:
|         cPickle.dump((self.model.weights, self.tagdict, self.classes),
|                      open(save_loc, 'wb'), -1)
|
||||
p.
|
||||
Unlike the previous snippets, this one’s literal – I tended to edit the
|
||||
previous ones to simplify. So if they have bugs, hopefully that’s why!
|
||||
|
||||
p.
|
||||
At the time of writing, I’m just finishing up the implementation before I
|
||||
submit a pull request to TextBlob. You can see the rest of the source here:
|
||||
|
||||
ul
|
||||
li
|
||||
a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py
|
||||
li
|
||||
a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") _perceptron.py
|
||||
|
||||
h3 A final comparison…
|
||||
|
||||
p.
|
||||
Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology.
|
||||
The claim is that we’ve just been meticulously over-fitting our methods to this
|
||||
data. Actually the evidence doesn’t really bear this out. Mostly, if a technique
|
||||
is clearly better on one evaluation, it improves others as well. Still, it’s
|
||||
very reasonable to want to know how these tools perform on other text. So I
|
||||
ran the unchanged models over two other sections from the OntoNotes corpus:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th WSJ
|
||||
th ABC
|
||||
th Web
|
||||
tbody
|
||||
tr
|
||||
td Pattern
|
||||
td 93.5
|
||||
td 90.7
|
||||
td 88.1
|
||||
tr
|
||||
td NLTK
|
||||
td 94.0
|
||||
td 91.5
|
||||
td 88.4
|
||||
tr
|
||||
td PyGreedyAP
|
||||
td 96.8
|
||||
td 94.8
|
||||
td 91.8
|
||||
|
||||
p.
|
||||
The ABC section is broadcast news; Web is text from the web (blogs and so
on; I haven’t looked at the data much).
|
||||
|
||||
p.
|
||||
As you can see, the order of the systems is stable across the three comparisons,
|
||||
and the advantage of our Averaged Perceptron tagger over the other two is real
|
||||
enough. Actually the Pattern tagger does very poorly on out-of-domain text.
|
||||
It mostly just looks up the words, so it’s very domain dependent. I hadn’t
|
||||
realised it before, but it’s obvious enough now that I think about it.
|
||||
|
||||
p.
|
||||
We can improve our score greatly by training on some of the foreign data.
|
||||
The technique described in this paper (Daume III, 2007) is the first thing
|
||||
I try when I have to do that.
|
||||
|
||||
|
||||
footer.meta(role='contentinfo')
|
||||
a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter
|
||||
.discuss
|
||||
a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News
|
||||
|
|
||||
a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit
|
|
@@ -1,139 +0,0 @@
|
|||
- var urls = {}
|
||||
- urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf"
|
||||
- urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf"
|
||||
|
||||
|
||||
+comparison("NLTK")
|
||||
p spaCy is:
|
||||
ul
|
||||
li.pro 100x faster;
|
||||
li.pro 50% more accurate;
|
||||
li.pro Serializes TODO% smaller;
|
||||
|
||||
p spaCy features:
|
||||
ul
|
||||
li.pro Integrated word vectors;
|
||||
li.pro Efficient binary serialization;
|
||||
|
||||
p NLTK features:
|
||||
ul
|
||||
li.con Multiple languages;
|
||||
li.neutral Educational resources
|
||||
|
||||
|
||||
//+comparison("Pattern")
|
||||
+comparison("CoreNLP")
|
||||
p spaCy is:
|
||||
|
||||
ul
|
||||
li.pro TODO% faster;
|
||||
li.pro TODO% more accurate;
|
||||
li.pro Not Java;
|
||||
li.pro Well documented;
|
||||
li.pro Cheaper to license commercially;
|
||||
li.neutral
|
||||
| Opinionated/Minimalist. spaCy avoids providing redundant or overlapping
|
||||
| options.
|
||||
|
||||
p CoreNLP features:
|
||||
|
||||
ul
|
||||
li.con Multiple Languages;
|
||||
li.con Sentiment analysis
|
||||
li.con Coreference resolution
|
||||
|
||||
|
||||
+comparison("ClearNLP")
|
||||
p spaCy is:
|
||||
|
||||
ul
|
||||
li.pro Not Java;
|
||||
li.pro TODO% faster;
|
||||
li.pro Well documented;
|
||||
li.neutral Slightly more accurate;
|
||||
|
||||
p ClearNLP features:
|
||||
|
||||
ul
|
||||
li.con Semantic Role Labelling
|
||||
li.con Multiple Languages
|
||||
li.con Model for biology/life-science;
|
||||
|
||||
//+comparison("Accuracy Summary")
|
||||
|
||||
//+comparison("Speed Summary")
|
||||
// table
|
||||
// thead
|
||||
// tr
|
||||
// th.
|
||||
// th(colspan=3) Absolute (ms per doc)
|
||||
// th(colspan=3) Relative (to spaCy)
|
||||
//
|
||||
// tbody
|
||||
// tr
|
||||
// td: strong System
|
||||
// td: strong Split
|
||||
// td: strong Tag
|
||||
// td: strong Parse
|
||||
// td: strong Split
|
||||
// td: strong Tag
|
||||
// td: strong Parse
|
||||
//
|
||||
// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
||||
// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
||||
// +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
|
||||
// +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
|
||||
// +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")
|
||||
//
|
||||
// p
|
||||
// | <strong>Set up</strong>: 100,000 plain-text documents were streamed
|
||||
// | from an SQLite3 database, and processed with an NLP library, to one
|
||||
// | of three levels of detail – tokenization, tagging, or parsing.
|
||||
// | The tasks are additive: to parse the text you have to tokenize and
|
||||
// | tag it. The pre-processing was not subtracted from the times –
|
||||
// | I report the time required for the pipeline to complete. I report
|
||||
// | mean times per document, in milliseconds.
|
||||
//
|
||||
// p
|
||||
// | <strong>Hardware</strong>: Intel i7-3770 (2012)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
+comparison("Peer-reviewed Evaluations")
|
||||
p.
|
||||
spaCy is committed to rigorous evaluation under standard methodology. Two
|
||||
papers in 2015 confirm that:
|
||||
ol
|
||||
li spaCy is the fastest syntactic parser in the world;
|
||||
li Its accuracy is within 1% of the best available;
|
||||
li The few systems that are more accurate are 20× slower or more.
|
||||
|
||||
p
|
||||
| spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University,
|
||||
| as part of a survey paper benchmarking the current state-of-the-art dependency
|
||||
| parsers
|
||||
a(href=urls.choi_paper) (Choi et al., 2015)
|
||||
| .
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("System", "Language", "Accuracy", "Speed")
|
||||
|
||||
tbody
|
||||
+row("spaCy v0.84", "Cython", "90.6", "13,963")
|
||||
+row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)")
|
||||
+row("ClearNLP", "Java", "91.7", "10,271")
|
||||
+row("CoreNLP", "Java", "89.6", "8,602")
|
||||
+row("MATE", "Java", "92.5", "550")
|
||||
+row("Turbo", "C++", "92.4", "349")
|
||||
+row("Yara", "Java", "92.3", "340")
|
||||
|
||||
p
|
||||
| Discussion with the authors led to accuracy improvements in spaCy, which
|
||||
| have been accepted for publication in EMNLP, in joint work with Macquarie
|
||||
| University
|
||||
a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015)
|
||||
| .
|
||||
|
|
@@ -1,129 +0,0 @@
|
|||
extends ./outline.jade
|
||||
|
||||
include ./mixins.jade
|
||||
|
||||
|
||||
mixin declare_class(name)
|
||||
details
|
||||
summary
|
||||
span.declaration
|
||||
span.label class
|
||||
code #{name}
|
||||
block
|
||||
|
||||
mixin method(name, parameters)
|
||||
details(open=attributes.open)
|
||||
summary
|
||||
span.declaration
|
||||
span.label #{name}
|
||||
span.parameters
|
||||
| self, #{parameters}
|
||||
block
|
||||
|
||||
|
||||
mixin params
|
||||
ul
|
||||
block
|
||||
|
||||
|
||||
mixin param(name, type, value)
|
||||
li
|
||||
if type
|
||||
<strong>#{name}</strong> (!{type}) –
|
||||
else
|
||||
<strong>#{name}</strong> –
|
||||
block
|
||||
|
||||
|
||||
mixin attribute(name, type, value)
|
||||
details(open=attributes.open)
|
||||
summary
|
||||
span.declaration
|
||||
span.label #{name}
|
||||
block
|
||||
|
||||
|
||||
mixin returns(name, type, value)
|
||||
li
|
||||
if type
|
||||
<strong>#{name}</strong> (!{type}) –
|
||||
else
|
||||
<strong>#{name}</strong> –
|
||||
block
|
||||
|
||||
|
||||
mixin returns(type)
|
||||
| tmp
|
||||
|
||||
mixin init
|
||||
details
|
||||
summary: h4 Init
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin callable
|
||||
details
|
||||
summary: h4 Callable
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin sequence
|
||||
details
|
||||
summary: h4 Sequence
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin maptype
|
||||
details
|
||||
summary: h4 Map
|
||||
|
||||
block
|
||||
|
||||
|
||||
mixin summary
|
||||
block
|
||||
|
||||
mixin en_example
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| from spacy._doc_examples import download_war_and_peace
|
||||
|
|
||||
| unprocessed_unicode = download_war_and_peace()
|
||||
|
|
||||
| nlp = English()
|
||||
| doc = nlp(unprocessed_unicode)
|
||||
|
||||
|
||||
block intro_block
|
||||
section(class="intro")
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="#api" class="button") API
|
||||
li: a(href="#tutorials" class="button") Tutorials
|
||||
li: a(href="#spec" class="button") Spec
|
||||
|
||||
|
||||
block body_block
|
||||
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
|
||||
|
||||
-
|
||||
var types = {
|
||||
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
|
||||
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
|
||||
'int': py_docs + 'functions.html#int"><em>int</em></a>',
|
||||
'generator': "",
|
||||
'Vocab': "",
|
||||
'Span': "",
|
||||
'Doc': ""
|
||||
}
|
||||
|
||||
article
|
||||
|
||||
+Section("API", "api", "api.jade")
|
||||
+Section("Tutorials", "tutorials", "tutorials.jade")
|
||||
+Section("Annotation Specifications", "spec", "spec.jade")
|
|
@@ -1,88 +0,0 @@
|
|||
extends ./outline.jade
|
||||
|
||||
include ./mixins.jade
|
||||
|
||||
// Notes
|
||||
//
|
||||
// 1. Where to put version notice? Should say something like
|
||||
// 2015-08-12: v0.89
|
||||
// and be a link
|
||||
//
|
||||
// Only needs to appear on home page.
|
||||
|
||||
|
||||
- var slogan = "Build Tomorrow's Language Technologies"
|
||||
- var tag_line = "spaCy – " + slogan
|
||||
|
||||
mixin lede
|
||||
- var state_of_the_art = '<a href="#">state-of-the-art</a>'
|
||||
- var a_minor_miracle = '<a href="">a minor miracle</a>'
|
||||
- var great_documentation = '<a href="">great documentation</a>'
|
||||
- var concise_API = '<a href="">concise API</a>'
|
||||
|
||||
p.
|
||||
<a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
|
||||
library for industrial-strength natural language processing in Python and
|
||||
Cython. It features !{state_of_the_art} speed and accuracy, a !{concise_API},
|
||||
and <a href="#license">license terms</a> designed to get out of your way.
|
||||
If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
|
||||
like !{a_minor_miracle}.
|
||||
|
||||
|
||||
mixin comparison(name)
|
||||
details
|
||||
summary
|
||||
h4= name
|
||||
|
||||
block
|
||||
|
||||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
||||
mixin social
|
||||
footer(role="contentinfo")
|
||||
a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter
|
||||
|
||||
div.discuss
|
||||
a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
|
||||
| Discuss on Hacker News
|
||||
|
||||
a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
|
||||
| Discuss on Reddit
|
||||
|
||||
|
||||
block intro_block
|
||||
section(class="intro")
|
||||
+lede
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="#example-use" class="button") Examples
|
||||
li: a(href="#comparisons" class="button") Comparisons
|
||||
li: a(href="#online-demo" class="button") Try Online
|
||||
li: a(href="#install" class="button")
|
||||
| Install
|
||||
<span class="button-caption">v0.89</span>
|
||||
|
||||
|
||||
|
||||
block body_block
|
||||
article(class="page landing-page")
|
||||
|
||||
+Section("Usage by Example", "example-use", "./usage_examples.jade")
|
||||
|
||||
+Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")
|
||||
|
||||
+Section("Online Demo", "online-demo", "./online_demo.jade")
|
||||
|
||||
|
||||
+Section("Install", "install", "./install.jade")
|
|
@@ -1,71 +0,0 @@
|
|||
mixin Option(name, open)
|
||||
details(open=open)
|
||||
summary
|
||||
h4= name
|
||||
block
|
||||
|
||||
+Option("conda", true)
|
||||
pre.language-bash: code
|
||||
| $ conda install spacy
|
||||
| $ python -m spacy.en.download
|
||||
|
||||
+Option("pip and virtualenv", true)
|
||||
p With Python 2.7 or Python 3, using Linux or OSX, run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ pip install spacy
|
||||
| $ python -m spacy.en.download
|
||||
|
||||
p
|
||||
| The download command fetches and installs about 300MB of data, for
|
||||
| the parser model and word vectors, which it installs within the spacy.en
|
||||
| package directory.
|
||||
|
||||
|
||||
+Option("Workaround for obsolete system Python", false)
|
||||
p
|
||||
| If you're stuck using a server with an old version of Python, and you
|
||||
| don't have root access, I've prepared a bootstrap script to help you
|
||||
| compile a local Python install. Run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
|
||||
|
||||
|
||||
+Option("Compile from source", false)
|
||||
p
|
||||
| The other way to install the package is to clone the github repository,
|
||||
| and build it from source. This installs an additional dependency,
|
||||
| Cython. If you're using Python 2, I also recommend installing fabric
|
||||
| and fabtools – this is how I build the project.
|
||||
|
||||
pre.language-bash: code
|
||||
| $ git clone https://github.com/honnibal/spaCy.git
|
||||
| $ cd spaCy
|
||||
| $ virtualenv .env && source .env/bin/activate
|
||||
| $ export PYTHONPATH=`pwd`
|
||||
| $ pip install -r requirements.txt
|
||||
| $ python setup.py build_ext --inplace
|
||||
| $ python -m spacy.en.download
|
||||
| $ pip install pytest
|
||||
| $ py.test tests/
|
||||
|
||||
p
|
||||
| Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
| with C extensions, built via Cython, requiring large data files. So,
|
||||
| please report issues as you encounter them.
|
||||
|
||||
+Option("pypy (Unsupported)")
|
||||
| If PyPy support is a priority for you, please get in touch. We could likely
|
||||
| fix the remaining issues, if necessary. However, the library is likely to
|
||||
| be much slower on PyPy, as it's written in Cython, which produces code tuned
|
||||
| for the performance of CPython.
|
||||
|
||||
+Option("Windows (Unsupported)")
|
||||
| Unfortunately we don't currently have access to a Windows machine, and have
|
||||
| no experience developing on a Microsoft stack. In theory the only problems are
|
||||
| with the installation and packaging – there should be no deep platform
|
||||
| dependency. Unfortunately we can't debug these issues at present, simply due
|
||||
| to lack of a development environment.
|
||||
|
|
@@ -1,179 +0,0 @@
|
|||
extends ./outline.jade
|
||||
|
||||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
||||
mixin LicenseOption(name, period, price, audience)
|
||||
.item
|
||||
h4 #{name}
|
||||
|
||||
.focus #{period}
|
||||
|
||||
span #{price}
|
||||
|
||||
h5 Suggested for:
|
||||
|
||||
span #{audience}
|
||||
|
||||
a.button(href="spacy_trial_free.docx") Download license
|
||||
|
||||
span or
|
||||
a(href="#") get in touch
|
||||
|
||||
|
||||
block body_block
|
||||
article.pricing
|
||||
|
||||
.box.license
|
||||
+LicenseOption("Trial", "90 days", "$0", "Evaluation")
|
||||
+LicenseOption("Production", "1 year", "$5,000", "Production")
|
||||
+LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning")
|
||||
|
||||
p.caption
|
||||
| Researcher, hobbyist, or open-source developer? spaCy also offers
|
||||
a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3
|
||||
| licenses.
|
||||
|
||||
p.
|
||||
What we offer is a rare, simple certainty: a long-term, permissive license
|
||||
that comes with full access to the source, complete transparency, and almost
|
||||
complete flexibility. The difference between this and a black-box API is
|
||||
night and day. You cannot build a great product against a service you
|
||||
don't understand, and you can't build a great business on a service you
|
||||
don't control.
|
||||
|
||||
p
|
||||
| Let's face it: services disappear. Constantly. The good start-ups get
|
||||
| bought; the bad ones go bankrupt. Open-source projects become abandoned
|
||||
| or bloated. Google's graveyard is over-flowing – ditto for Yahoo!,
|
||||
| Microsoft, etc. Sure, IBM won’t go broke... But will BlueMix be sunset?
|
||||
|
||||
p
|
||||
| A 5 year license won't expire until 2020. spaCy will be with you for
|
||||
| longer than most of your current staff. If that's still not enough,
|
||||
| get in touch. I'm sure we can work something out.
|
||||
|
||||
//p.
|
||||
// To make spaCy as valuable as possible, licenses to it are for life. You get
|
||||
// complete transparency, certainty and control. If you need to use spaCy
|
||||
// as an API, it's trivial to host it yourself – and you don't need to
|
||||
// worry about the service changing or disappearing. And if you're ever in
|
||||
// acquisition or IPO talks, the story is simple.
|
||||
|
||||
//p.
|
||||
// spaCy can also be used as free open-source software, under the Aferro GPL
|
||||
// license. If you use it this way, you must comply with the AGPL license
|
||||
// terms. When you distribute your project, or offer it as a network service,
|
||||
// you must distribute the source-code and grant users an AGPL license to it.
|
||||
|
||||
|
||||
//h3 Examples
|
||||
|
||||
//p.
|
||||
// In order to clarify how spaCy's license structure might apply to you, I've
|
||||
// written a few examples, in the form of user-stories.
|
||||
|
||||
//details
|
||||
// summary: h4 Seed stage start-ups
|
||||
|
||||
// p.
|
||||
// Ashley and Casey have an idea for a start-up. To explore their idea, they
|
||||
// want to build a minimum viable product they can put in front of potential
|
||||
// users and investors.
|
||||
|
||||
// p. They have two options.
|
||||
|
||||
// ol
|
||||
// li
|
||||
// p.
|
||||
// <strong>Trial commercial license.</strong> With a simple form, they can
|
||||
// use spaCy for 90 days, for a nominal fee of $1. They are free to modify
|
||||
// spaCy, and they will own the copyright to their modifications for the
|
||||
// duration of the license. After the trial period elapses, they can either
|
||||
// pay the license fee, stop using spaCy, release their project under the
|
||||
// AGPL.
|
||||
//
|
||||
// li
|
||||
// p.
|
||||
// <strong>AGPL.</strong> Casey and Pat can instead use spaCy under the AGPL
|
||||
// license. However, they must then release any code that statically or
|
||||
// dynamically links to spaCy under the AGPL as well (e.g. if they import
|
||||
// the module, or import a module that imports it, etc). They also cannot
|
||||
// use spaCy as a network resource, by running it as a service --- this is
|
||||
// the loophole that the "A" part of the AGPL is designed to close.
|
||||
//
|
||||
// p.
|
||||
// Ashley and Casey find the AGPL license unattractive for commercial use.
|
||||
// They decide to take up the trial commercial license. However, over the
|
||||
// next 90 days, Ashley has to move house twice, and Casey gets sick. By
|
||||
// the time the trial expires, they still don't have a demo they can show
|
||||
// investors. They send an email explaining the situation, and a 90 day extension
|
||||
// to their trial license is granted.
|
||||
|
||||
// p.
|
||||
// By the time the extension period has elapsed, spaCy has helped them secure
|
||||
// funding, and they even have a little revenue. They are glad to pay the
|
||||
// $5,000 commercial license fee.
|
||||
|
||||
// p.
|
||||
// spaCy is now permanently licensed for the product Ashley and Casey are
|
||||
// developing. They own the copyright to any modifications they make to spaCy,
|
||||
// but not to the original spaCy code.
|
||||
|
||||
// p.
|
||||
// No additional fees will be due when they hire new developers, run spaCy on
|
||||
// additional internal servers, etc. If their company is acquired, the license
|
||||
// will be transferred to the company acquiring them. However, to use spaCy
|
||||
// in another product, they will have to buy a second license.
|
||||
|
||||
|
||||
// details
|
||||
// summary: h4 University academics
|
||||
|
||||
// p.
|
||||
// Alex and Sasha are post-doctoral researchers working for a university.
|
||||
// Part of their funding comes from a grant from Google, but Google will not
|
||||
// own any part of the work that they produce. Their mission is just to write
|
||||
// papers.
|
||||
|
||||
// p.
|
||||
// Alex and Sasha find spaCy convenient, so they use it in their system under
|
||||
// the AGPL. This means that their system must also be released under the
|
||||
// AGPL, but they're cool with that – they were going to release their
|
||||
// code anyway, as it's the only way to ensure their experiments are properly
|
||||
// repeatable.
|
||||
|
||||
// p.
|
||||
// Alex and Sasha find and fix a few bugs in spaCy. They must release these
|
||||
// modifications, and they ask that they be accepted into the main spaCy repo.
|
||||
// In order to do this, they must sign a contributor agreement, ceding their
|
||||
// copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
|
||||
// not be able to claim any royalties from their contributions.
|
||||
|
||||
// p.
|
||||
// Later, Alex and Sasha implement new features into spaCy, for another paper.
|
||||
// The code was quite rushed, and they don't want to take the time to put
|
||||
// together a proper pull request. They must release their modifications
|
||||
// under the AGPL, but they are not obliged to contribute it to the spaCy
|
||||
// repository, or concede their copyright.
|
||||
|
||||
// details
|
||||
// summary: h4 Open Source developers
|
||||
|
||||
// p.
|
||||
// Phuong and Jessie use the open-source software Calibre to manage their
|
||||
// e-book libraries. They have an idea for a search feature, and they want
|
||||
// to use spaCy to implement it. Calibre is released under the GPLv3. The
|
||||
// AGPL has additional restrictions for projects used as a network resource,
|
||||
// but they don't apply to this project, so Phuong and Jessie can use spaCy
|
||||
// to improve Calibre. They'll have to release their code, but that was
|
||||
// always their intention anyway.
|
|
@@ -1,17 +0,0 @@
|
|||
mixin Section(title_text, link_name, include_file)
|
||||
h3: a(name=link_name) #{title_text}
|
||||
|
||||
if (link_name == "example-use")
|
||||
include ./usage_examples.jade
|
||||
else if (link_name == "online-demo")
|
||||
include ./online_demo.jade
|
||||
else if (link_name == "comparisons")
|
||||
include ./comparisons.jade
|
||||
else if (link_name == "install")
|
||||
include ./installation.jade
|
||||
else if (link_name == "api")
|
||||
include ./api.jade
|
||||
else if (link_name == "tutorials")
|
||||
include ./tutorials.jade
|
||||
else if (link_name == "spec")
|
||||
include ./spec.jade
|
|
@@ -1,18 +0,0 @@
|
|||
mixin Displacy(sentence, caption_text, height)
|
||||
- var url = "http://ines.io/displacy/?full=" + sentence.replace(" ", "%20")
|
||||
|
||||
.displacy
|
||||
iframe.displacy(src="displacy/displacy_demo.html" height=height)
|
||||
|
||||
a.view-displacy(href=url)
|
||||
| Interactive Visualizer
|
||||
|
||||
p.caption.
|
||||
#{caption_text}
|
||||
|
||||
|
||||
+Displacy(
|
||||
"Click the button to see this sentence in displaCy.",
|
||||
"The best parse-tree visualizer and annotation tool in all the land.",
|
||||
275
|
||||
)
|
|
@@ -1,37 +0,0 @@
|
|||
- var slogan = "Build Tomorrow's Language Technologies"
|
||||
- var tag_line = "spaCy – " + slogan
|
||||
|
||||
|
||||
doctype html
|
||||
html(lang="en")
|
||||
head
|
||||
meta(charset="utf-8")
|
||||
title!= tag_line
|
||||
meta(name="description" content="")
|
||||
meta(name="author" content="Matthew Honnibal")
|
||||
link(rel="stylesheet" href="css/style.css")
|
||||
<!--[if lt IE 9]>
|
||||
script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
|
||||
<![endif]-->
|
||||
|
||||
body(id="home" role="document")
|
||||
header(role="banner")
|
||||
h1(class="logo")!= tag_line
|
||||
div(class="slogan")!= slogan
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="home.html") Home
|
||||
li: a(href="docs.html") Docs
|
||||
li: a(href="license.html") License
|
||||
li: a(href="blog.html") Blog
|
||||
|
||||
main(id="content" role="main")
|
||||
block intro_block
|
||||
|
||||
block body_block
|
||||
|
||||
footer(role="contentinfo")
|
||||
|
||||
script(src="js/prism.js")
|
||||
script(src="js/details_polyfill.js")
|
|
@@ -1,129 +0,0 @@
|
|||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
||||
details
|
||||
summary: h4 Overview
|
||||
|
||||
p.
|
||||
This document describes the target annotations spaCy is trained to predict.
|
||||
This is currently a work in progress. Please ask questions on the issue tracker,
|
||||
so that the answers can be integrated here to improve the documentation.
|
||||
|
||||
details
|
||||
summary: h4 Tokenization
|
||||
|
||||
p Tokenization standards are based on the OntoNotes 5 corpus.
|
||||
|
||||
p.
|
||||
The tokenizer differs from most by including tokens for significant
|
||||
whitespace. Any sequence of whitespace characters beyond a single space
|
||||
(' ') is included as a token. For instance:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English(parse=False)
|
||||
| tokens = nlp('Some\nspaces and\ttab characters')
|
||||
| print([t.orth_ for t in tokens])
|
||||
|
||||
p Which produces:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
|
||||
|
||||
p.
|
||||
The whitespace tokens are useful for much the same reason punctuation is
|
||||
– it's often an important delimiter in the text. By preserving
|
||||
it in the token output, we are able to maintain a simple alignment
|
||||
between the tokens and the original string, and we ensure that no
|
||||
information is lost during processing.
|
||||
|
||||
details
|
||||
summary: h4 Sentence boundary detection
|
||||
|
||||
p.
|
||||
Sentence boundaries are calculated from the syntactic parse tree, so
|
||||
features such as punctuation and capitalisation play an important but
|
||||
non-decisive role in determining the sentence boundaries. Usually this
|
||||
means that the sentence boundaries will at least coincide with clause
|
||||
boundaries, even given poorly punctuated text.
|
||||
|
||||
details
|
||||
summary: h4 Part-of-speech Tagging
|
||||
|
||||
p.
|
||||
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
|
||||
tag set. We also map the tags to the simpler Google Universal POS Tag set.
|
||||
|
||||
p.
|
||||
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
|
||||
|
||||
details
|
||||
summary: h4 Lemmatization
|
||||
|
||||
p.
|
||||
A "lemma" is the uninflected form of a word. In English, this means:
|
||||
|
||||
ul
|
||||
li Adjectives: The form like "happy", not "happier" or "happiest"
|
||||
li Adverbs: The form like "badly", not "worse" or "worst"
|
||||
li Nouns: The form like "dog", not "dogs"; like "child", not "children"
|
||||
li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
|
||||
p.
|
||||
The lemmatization data is taken from WordNet. However, we also add a
|
||||
special case for pronouns: all pronouns are lemmatized to the special
|
||||
token -PRON-.
|
||||
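p.
For example, the rules above imply the following (the commented output is
what the scheme prescribes, shown for illustration rather than as verified
output):
pre.language-python
code
| from spacy.en import English
| nlp = English(parse=False)
| tokens = nlp(u'We are writing about dogs')
| print([t.lemma_ for t in tokens])
| # Per the rules above: [u'-PRON-', u'be', u'write', u'about', u'dog']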
|
||||
|
||||
details
|
||||
summary: h4 Syntactic Dependency Parsing
|
||||
|
||||
p.
|
||||
The parser is trained on data produced by the ClearNLP converter. Details
|
||||
of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
|
||||
|
||||
details
|
||||
summary: h4 Named Entity Recognition
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
|
||||
tbody
|
||||
+row("PERSON", "People, including fictional.")
|
||||
+row("NORP", "Nationalities or religious or political groups.")
|
||||
+row("FACILITY", "Buildings, airports, highways, bridges, etc.")
|
||||
+row("ORG", "Companies, agencies, institutions, etc.")
|
||||
+row("GPE", "Countries, cities, states.")
|
||||
+row("LOC", "Non-GPE locations, mountain ranges, bodies of water.")
|
||||
+row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services")
|
||||
+row("EVENT", "Named hurricanes, battles, wars, sports events, etc.")
|
||||
+row("WORK_OF_ART", "Titles of books, songs, etc.")
|
||||
+row("LAW", "Named documents made into laws")
|
||||
+row("LANGUAGE", "Any named language")
|
||||
|
||||
p The following values are also annotated in a style similar to names:
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
|
||||
tbody
|
||||
+row("DATE", "Absolute or relative dates or periods")
|
||||
+row("TIME", "Times smaller than a day")
|
||||
+row("PERCENT", 'Percentage (including “%”)')
|
||||
+row("MONEY", "Monetary values, including unit")
|
||||
+row("QUANTITY", "Measurements, as of weight or distance")
|
||||
+row("ORDINAL", 'first", "second"')
|
||||
+row("CARDINAL", "Numerals that do not fall under another type")
|
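p.
As a quick illustration of reading these annotations off a processed
document (a sketch assuming the Doc.ents API; the entity labels shown are
what the tables above prescribe, not verified output):
pre.language-python
code
| from spacy.en import English
| nlp = English()
| tokens = nlp(u'Apple opened an office in London in 2014.')
| for ent in tokens.ents:
|     print(ent.label_, ' '.join(t.orth_ for t in ent))
| # Expected per the tables above: ORG Apple, GPE London, DATE 2014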
|
@@ -1,31 +0,0 @@
|
|||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog(role="document")
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a(href="home.html") Home
|
||||
li: a(href="docs.html") Docs
|
||||
li.active: a(href="blog.html") Blog
|
||||
li: a(href="license.html") License
|
||||
|
||||
main#content(role='main')
|
||||
block intro_block
|
||||
|
||||
block body_block
|
||||
|
||||
footer(role='contentinfo')
|
||||
|
||||
script(src="js/prism.js")
|
||||
script(src="js/details_polyfill.js")
|
|
@@ -1,200 +0,0 @@
|
|||
doctype html
|
||||
html(lang='en')
|
||||
head
|
||||
meta(charset='utf-8')
|
||||
title spaCy Blog
|
||||
meta(name='description', content='')
|
||||
meta(name='author', content='Matthew Honnibal')
|
||||
link(rel='stylesheet', href='css/style.css')
|
||||
//if lt IE 9
|
||||
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
|
||||
body#blog
|
||||
header(role='banner')
|
||||
h1.logo spaCy Blog
|
||||
.slogan Blog
|
||||
main#content(role='main')
|
||||
article.post
|
||||
|
||||
|
||||
:markdown-it
|
||||
# Adverbs
|
||||
|
||||
Let's say you're developing a proofreading tool, or possibly an IDE for
|
||||
writers. You're convinced by Stephen King's advice that `adverbs are
|
||||
not your friend <http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs/>`_,
|
||||
so you want to **highlight all adverbs**. We'll use one of the examples
|
||||
he finds particularly egregious:
|
||||
|
||||
        pre.language-python
          code
            | >>> import spacy.en
            | >>> from spacy.parts_of_speech import ADV
            | >>> # Load the pipeline, and call it with some text.
            | >>> nlp = spacy.en.English()
            | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False)
            | >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
            | u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
        :markdown-it
          Easy enough --- but the problem is that we've also highlighted "back".
          While "back" is undoubtedly an adverb, we probably don't want to highlight
          it. If what we're trying to do is flag dubious stylistic choices, we'll
          need to refine our logic. It turns out only a certain type of adverb
          is of interest to us.

        :markdown-it
          There are lots of ways we might do this, depending on just what words
          we want to flag. The simplest way to exclude adverbs like "back" and
          "not" is by word frequency: these words are much more common than the
          prototypical manner adverbs that the style guides are worried about.

        :markdown-it
          The `Lexeme.prob` and `Token.prob` attributes give a
          log probability estimate of the word:
        pre.language-python
          code
            | >>> nlp.vocab[u'back'].prob
            | -7.403977394104004
            | >>> nlp.vocab[u'not'].prob
            | -5.407193660736084
            | >>> nlp.vocab[u'quietly'].prob
            | -11.07155704498291
        :markdown-it
          (The probability estimate is based on counts from a 3 billion word corpus,
          smoothed using the Simple Good-Turing method.)

          So we can easily exclude the N most frequent words in English from our
          adverb marker. Let's try N=1000 for now:
        pre.language-python
          code
            | >>> import spacy.en
            | >>> from spacy.parts_of_speech import ADV
            | >>> nlp = spacy.en.English()
            | >>> # Find log probability of Nth most frequent word
            | >>> probs = [lex.prob for lex in nlp.vocab]
            | >>> probs.sort()
            | >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
            | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
            | >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
            | ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’
        :markdown-it
          There are lots of other ways we could refine the logic, depending on
          just what words we want to flag. Let's say we wanted to only flag
          adverbs that modified words similar to "pleaded". This is easy to do,
          as spaCy loads a vector-space representation for every word (by default,
          the vectors produced by Levy and Goldberg (2014)). Naturally, the
          vector is provided as a numpy array:
        pre.language-python
          code
            | >>> pleaded = tokens[7]
            | >>> pleaded.repvec.shape
            | (300,)
            | >>> pleaded.repvec[:5]
            | array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32)
        :markdown-it
          We want to sort the words in our vocabulary by their similarity to
          "pleaded". There are lots of ways to measure the similarity of two
          vectors. We'll use the cosine metric:
        pre.language-python
          code
            | >>> from numpy import dot
            | >>> from numpy.linalg import norm
            |
            | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
            | >>> words = [w for w in nlp.vocab if w.has_repvec]
            | >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
            | >>> words.reverse()
            | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
            | 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading
            | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
            | 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses
            | >>> print('100-110', ', '.join(w.orth_ for w in words[100:110]))
            | 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes
            | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
            | 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged
            | >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010]))
            | 50000-50010 fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists
        :markdown-it
          As you can see, the similarity model that these vectors give us is excellent
          --- we're still getting meaningful results at 1000 words, off a single
          prototype! The only problem is that the list really contains two clusters of
          words: one associated with the legal meaning of "pleaded", and one for the more
          general sense. Sorting out these clusters is an area of active research.

          A simple work-around is to average the vectors of several words, and use that
          as our target:
        pre.language-python
          code
            | >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested']
            | >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs)
            | >>> words.sort(key=lambda w: cosine(w.repvec, say_vector))
            | >>> words.reverse()
            | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
            | 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired
            | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
            | 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed
            | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
            | 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate
        :markdown-it
          These definitely look like words that King might scold a writer for attaching
          adverbs to. Recall that our original adverb highlighting function looked like
          this:
        pre.language-python
          code
            | >>> import spacy.en
            | >>> from spacy.parts_of_speech import ADV
            | >>> # Load the pipeline, and call it with some text.
            | >>> nlp = spacy.en.English()
            | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
            |                  tag=True, parse=False)
            | >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens))
            | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
        :markdown-it
          We wanted to refine the logic so that only adverbs modifying evocative
          verbs of communication, like "pleaded", were highlighted. We've now
          built a vector that represents that type of word, so we can highlight
          adverbs based on subtler logic, homing in on the adverbs that seem the
          most stylistically problematic, given our starting assumptions:
        pre.language-python
          code
            | >>> import numpy
            | >>> from numpy import dot
            | >>> from numpy.linalg import norm
            | >>> import spacy.en
            | >>> from spacy.parts_of_speech import ADV, VERB
            | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
            | >>> def is_bad_adverb(token, target_verb, tol):
            | ...     if token.pos != ADV:
            | ...         return False
            | ...     elif token.head.pos != VERB:
            | ...         return False
            | ...     elif cosine(token.head.repvec, target_verb) < tol:
            | ...         return False
            | ...     else:
            | ...         return True
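
        :markdown-it
          As a hypothetical usage sketch (the tolerance value here is arbitrary,
          and `say_vector` is the averaged vector we built above):

        pre.language-python
          code
            | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
            | >>> print(u''.join(tok.string.upper() if is_bad_adverb(tok, say_vector, 0.7) else tok.string for tok in tokens))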
        :markdown-it
          This example was somewhat contrived --- and, truth be told, I've never
          really bought the idea that adverbs were a grave stylistic sin. But
          hopefully it got the message across: the state-of-the-art NLP technologies
          are very powerful. spaCy gives you easy and efficient access to them,
          which lets you build all sorts of useful products and features that
          were previously impossible.

    footer(role='contentinfo')

    script(src='js/prism.js')
@ -1,132 +0,0 @@
doctype html
html(lang='en')
  head
    meta(charset='utf-8')
    title spaCy Blog
    meta(name='description', content='')
    meta(name='author', content='Matthew Honnibal')
    link(rel='stylesheet', href='css/style.css')
    //if lt IE 9
      script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')

  body#blog
    header(role='banner')
      h1.logo spaCy Blog
      .slogan Blog

    main#content(role='main')
      section.intro
        p
          | Example use of the spaCy NLP tools for data exploration.
          | Here we will look for reddit comments that describe Google doing something,
          | i.e. discuss the company's actions. This is difficult, because other senses of
          | "Google" now dominate usage of the word in conversation, particularly references to
          | using Google products.

        p
          | The heuristics used are quick and dirty – about 5 minutes' work.

        //| A better approach is to use the word vector of the verb. But, the
        // | demo here is just to show what's possible to build up quickly, to
        // | start to understand some data.
      article.post
        header
          h2 Syntax-specific Search
          .subhead
            | by
            a(href='#', rel='author') Matthew Honnibal
            | on
            time(datetime='2015-08-14') August

        details
          summary: h4 Imports

          pre.language-python
            code
              | from __future__ import unicode_literals
              | from __future__ import print_function
              | import sys
              |
              | import plac
              | import bz2
              | import ujson
              | import spacy.en
        details
          summary: h4 Load the model and iterate over the data

          pre.language-python
            code
              | def main(input_loc):
              |     nlp = spacy.en.English() # Loading the model takes 10-20 seconds.
              |     for line in bz2.BZ2File(input_loc): # Iterate over the reddit comments from the dump.
              |         comment_str = ujson.loads(line)['body'] # Parse the json object, and extract the 'body' attribute.
        details
          summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want

          pre.language-python
            code
              |         comment_parse = nlp(comment_str)
              |         for word in comment_parse:
              |             if google_doing_something(word):
              |                 # Print the clause
              |                 print(''.join(w.string for w in word.head.subtree).strip())
        details
          summary: h4 Define the filter function

          pre.language-python
            code
              | def google_doing_something(w):
              |     if w.lower_ != 'google':
              |         return False
              |     # Is it the subject of a verb?
              |     elif w.dep_ != 'nsubj':
              |         return False
              |     # And not 'is'
              |     elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux':
              |         return False
              |     # Exclude e.g. "Google says..."
              |     elif w.head.lemma_ in ('say', 'show'):
              |         return False
              |     else:
              |         return True
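
        details
          summary: h4 Try the filter on a single sentence

          p.
            A quick sketch of the filter in isolation, on a made-up sentence
            (this assumes the parser analyses "Google" as the subject of "bought"):

          pre.language-python
            code
              | >>> doc = nlp(u'Google bought the company.')
              | >>> [w.orth_ for w in doc if google_doing_something(w)]
              | [u'Google']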
        details
          summary: h4 Call main

          pre.language-python
            code
              | if __name__ == '__main__':
              |     plac.call(main)

        details
          summary: h4 Example output
          p.
            Many false positives remain. Some are from incorrect interpretations
            of the sentence by spaCy, some are flaws in our filtering logic. But
            the results are vastly better than a string-based search, which returns
            almost no examples of the pattern we're looking for.

          code
            | Google dropped support for Android < 4.0 already
            | google drive
            | Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc
            | When Google responds
            | Google translate cyka pasterino.
            | A quick google looks like Synology does have a sync'ing feature which does support block level so that should work
            | (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible?
            | Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop.
            | Google offers something like this already, but it is truly terrible.
            | google isn't helping me
            | Google tells me: 0 results, 250 pages removed from google.
            | how did Google swoop in and eat our lunch

    script(src="js/prism.js")
    script(src="js/details_polyfill.js")
@ -1,204 +0,0 @@
doctype html
html(lang='en')
  head
    meta(charset='utf-8')
    title spaCy Blog
    meta(name='description', content='')
    meta(name='author', content='Matthew Honnibal')
    link(rel='stylesheet', href='css/style.css')
    //if lt IE 9
      script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')

  body#blog
    header(role='banner')
      h1.logo spaCy Blog
      .slogan Blog

    main#content(role='main')
      article.post
        header
          h2 Finding Relevant Tweets
          .subhead
            | by
            a(href='#', rel='author') Matthew Honnibal
            | on
            time(datetime='2015-08-14') December
        details
          summary: h4 Imports

          pre.language-python
            code
              | from __future__ import unicode_literals, print_function
              | import plac
              | import codecs
              | import sys
              | import math
              |
              | import spacy.en
              | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
              |
              | from termcolor import colored
              | from twython import TwythonStreamer
              |
              | from os import path
              | from math import sqrt
              |
              | from numpy import dot
              | from numpy.linalg import norm
        details
          summary: h4 Simple vector-averaging similarity

          pre.language-python: code
            | class Meaning(object):
            |     def __init__(self, vectors):
            |         if vectors:
            |             self.vector = sum(vectors) / len(vectors)
            |             self.norm = norm(self.vector)
            |         else:
            |             self.vector = None
            |             self.norm = 0
            |
            |     @classmethod
            |     def from_path(cls, nlp, loc):
            |         with codecs.open(loc, 'r', 'utf8') as file_:
            |             terms = file_.read().strip().split()
            |         return cls.from_terms(nlp, terms)
            |
            |     @classmethod
            |     def from_tokens(cls, nlp, tokens):
            |         vectors = [t.repvec for t in tokens]
            |         return cls(vectors)
            |
            |     @classmethod
            |     def from_terms(cls, nlp, examples):
            |         lexemes = [nlp.vocab[eg] for eg in examples]
            |         vectors = [eg.repvec for eg in lexemes]
            |         return cls(vectors)
            |
            |     def similarity(self, other):
            |         if not self.norm or not other.norm:
            |             return -1
            |         return dot(self.vector, other.vector) / (self.norm * other.norm)
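
        p.
          A quick sketch of the Meaning class in use, with arbitrary terms
          (the similarity value depends on the loaded vectors):

        pre.language-python: code
          | >>> nlp = spacy.en.English(Parser=None)
          | >>> dogs = Meaning.from_terms(nlp, [u'dog', u'puppy', u'hound'])
          | >>> cats = Meaning.from_terms(nlp, [u'cat', u'kitten'])
          | >>> dogs.similarity(cats)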
        details
          summary: h4 Print matches

          pre.language-python: code
            | def print_colored(model, stream=sys.stdout):
            |     if model['is_match']:
            |         color = 'green'
            |     elif model['is_reject']:
            |         color = 'red'
            |     else:
            |         color = 'grey'
            |
            |     if not model['is_rare'] and model['is_match'] and not model['is_reject']:
            |         match_score = colored('%.3f' % model['match_score'], 'green')
            |         reject_score = colored('%.3f' % model['reject_score'], 'red')
            |         prob = '%.5f' % model['prob']
            |
            |         print(match_score, reject_score, prob)
            |         print(repr(model['text']), color)
            |         print('')
        details
          summary: h4 TextMatcher: Process the tweets using spaCy

          pre.language-python: code
            | class TextMatcher(object):
            |     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
            |         self.nlp = nlp
            |         self.get_target = get_target
            |         self.get_reject = get_reject
            |         self.min_prob = min_prob
            |         self.min_match = min_match
            |         self.max_reject = max_reject
            |
            |     def __call__(self, text):
            |         tweet = self.nlp(text)
            |         target = self.get_target()
            |         reject = self.get_reject()
            |
            |         prob = sum(math.exp(w.prob) for w in tweet) / len(tweet)
            |         meaning = Meaning.from_tokens(self.nlp, tweet)
            |
            |         match_score = meaning.similarity(target)
            |         reject_score = meaning.similarity(reject)
            |         return {
            |             'text': tweet.string,
            |             'prob': prob,
            |             'match_score': match_score,
            |             'reject_score': reject_score,
            |             'is_rare': prob < self.min_prob,
            |             'is_match': prob >= self.min_prob and match_score >= self.min_match,
            |             'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
            |         }
        details
          summary: h4 Connect to Twitter and stream tweets

          pre.language-python: code
            | class Connection(TwythonStreamer):
            |     def __init__(self, keys_dir, handler, view):
            |         keys = Secrets(keys_dir)
            |         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
            |         self.handler = handler
            |         self.view = view
            |
            |     def on_success(self, data):
            |         text = data.get('text', u'')
            |         # Twython returns either bytes or unicode, depending on tweet.
            |         # #APIshaming
            |         try:
            |             model = self.handler(text)
            |         except TypeError:
            |             model = self.handler(text.decode('utf8'))
            |         status = self.view(model, sys.stdout)
            |
            |     def on_error(self, status_code, data):
            |         print(status_code)
            |
            |
            | class Secrets(object):
            |     def __init__(self, key_dir):
            |         self.key = open(path.join(key_dir, 'key.txt')).read().strip()
            |         self.secret = open(path.join(key_dir, 'secret.txt')).read().strip()
            |         self.token = open(path.join(key_dir, 'token.txt')).read().strip()
            |         self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip()
        details
          summary: h4 Command-line interface

          pre.language-python: code
            | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
            |     # We don't need the parser for this demo, so may as well save the loading time
            |     nlp = spacy.en.English(Parser=None)
            |     get_target = lambda: Meaning.from_path(nlp, target_loc)
            |     get_reject = lambda: Meaning.from_path(nlp, reject_loc)
            |     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
            |
            |     twitter = Connection(keys_dir, matcher, print_colored)
            |     twitter.statuses.filter(track=term)
            |
            |
            | if __name__ == '__main__':
            |     plac.call(main)
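
        p.
          A sketch of how plac maps the command line onto the function above
          (the key directory, search term and term files are placeholders):

        pre.language-python: code
          | >>> # Equivalent to: python twitter_filter.py keys/ spacy target.txt reject.txt
          | >>> main('keys/', 'spacy', 'target.txt', 'reject.txt')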
    footer(role='contentinfo')

    script(src='js/prism.js')
@ -1,29 +0,0 @@
mixin Tutorial(title)
  details
    summary
      h4= title

    block


+Tutorial("Mark-up all manner adverbs, especially for verbs of speech")
  | Let's say you're developing a proofreading tool, or possibly an IDE for
  | writers. You're convinced by Stephen King's advice that
  | adverbs are not your friend,
  | so you want to
  a.readmore(href='tute_adverbs.html')
    | highlight all adverbs. ►

+Tutorial("Search Reddit for comments about Google doing something")
  | Example use of the spaCy NLP tools for data exploration.
  | Here we will look for Reddit comments that describe Google doing something,
  | i.e. discuss the company's actions. This is difficult, because other
  | senses of "Google" now dominate usage of the word in conversation,
  | particularly references to using Google products.
  a.readmore(href='tute_adverbs.html')
    | ►

+Tutorial("Use word vectors for semantic search of Twitter")
  | Stream tweets with Twython, and use spaCy's word vectors with a simple
  | vector-averaging similarity model to find the tweets that are most
  | relevant to a set of target terms.
  a.readmore(href='tute_twitter.html')
    | ►
@ -1,167 +0,0 @@
mixin example(name)
  details
    summary
      h4= name

    block

+example("Load resources and process text")
|
||||
pre.language-python: code
|
||||
| from __future__ import unicode_literals, print_function
|
||||
| from spacy.en import English
|
||||
| nlp = English()
|
||||
| doc = nlp('Hello, world. Here are two sentences.')
|
||||
|
||||
+example("Get tokens and sentences")
|
||||
pre.language-python: code
|
||||
| token = doc[0]
|
||||
| sentence = doc.sents[0]
|
||||
| assert token[0] is sentence[0]
|
||||
|
||||
+example("Use integer IDs for any string")
|
||||
pre.language-python: code
|
||||
| hello_id = nlp.vocab.strings['Hello']
|
||||
| hello_str = nlp.vocab.strings[hello_id]
|
||||
|
|
||||
| assert token.orth == hello_id == 52
|
||||
| assert token.orth_ == hello_str == 'Hello'
|
||||
|
||||
+example("Get and set string views and flags")
|
||||
pre.language-python: code
|
||||
| assert token.shape_ == 'Xxxx'
|
||||
| for lexeme in nlp.vocab:
|
||||
| if lexeme.is_alpha:
|
||||
| lexeme.shape_ = 'W'
|
||||
| elif lexeme.is_digit:
|
||||
| lexeme.shape_ = 'D'
|
||||
| elif lexeme.is_punct:
|
||||
| lexeme.shape_ = 'P'
|
||||
| else:
|
||||
| lexeme.shape_ = 'M'
|
||||
| assert token.shape_ == 'W'
|
||||
|
||||
+example("Export to numpy arrays")
|
||||
pre.language-python: code
|
||||
| from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
|
||||
|
|
||||
| attr_ids = [ORTH, LIKE_URL, IS_OOV]
|
||||
| doc_array = doc.to_array(attr_ids)
|
||||
| assert doc_array.shape == (len(doc), len(attrs)
|
||||
| assert doc[0].orth == doc_array[0, 0]
|
||||
| assert doc[1].orth == doc_array[1, 0]
|
||||
| assert doc[0].like_url == doc_array[0, 1]
|
||||
| assert doc_array[, 1] == [t.like_url for t in doc]
|
||||
|
||||
+example("Word vectors")
|
||||
pre.language-python: code
|
||||
| doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
|
||||
|
|
||||
| apples = doc[0]
|
||||
| oranges = doc[1]
|
||||
| boots = doc[6]
|
||||
| hippos = doc[8]
|
||||
|
|
||||
| assert apples.similarity(oranges) > boots.similarity(hippos)
|
||||
|
||||
|
||||
+example("Part-of-speech tags")
|
||||
pre.language-python: code
|
||||
| from spacy.parts_of_speech import ADV
|
||||
|
|
||||
| def is_adverb(token):
|
||||
| return token.pos == spacy.parts_of_speech.ADV
|
||||
|
|
||||
| # These are data-specific, so no constants are provided. You have to look
|
||||
| # up the IDs from the StringStore.
|
||||
| NNS = nlp.vocab.strings['NNS']
|
||||
| NNPS = nlp.vocab.strings['NNPS']
|
||||
| def is_plural_noun(token):
|
||||
| return token.tag == NNS or token.tag == NNPS
|
||||
|
|
||||
| def print_coarse_pos(token):
|
||||
| print(token.pos_)
|
||||
|
|
||||
| def print_fine_pos(token):
|
||||
| print(token.tag_)
|
||||
|
||||
+example("Syntactic dependencies")
|
||||
pre.language-python: code
|
||||
| def dependency_labels_to_root(token):
|
||||
| '''Walk up the syntactic tree, collecting the arc labels.'''
|
||||
| dep_labels = []
|
||||
| while token.root is not token:
|
||||
| dep_labels.append(token.dep)
|
||||
| token = token.head
|
||||
| return dep_labels
|
||||
|
||||
+example("Named entities")
|
||||
pre.language-python: code
|
||||
| def iter_products(docs):
|
||||
| for doc in docs:
|
||||
| for ent in doc.ents:
|
||||
| if ent.label_ == 'PRODUCT':
|
||||
| yield ent
|
||||
|
|
||||
| def word_is_in_entity(word):
|
||||
| return word.ent_type != 0
|
||||
|
|
||||
| def count_parent_verb_by_person(docs):
|
||||
| counts = defaultdict(defaultdict(int))
|
||||
| for doc in docs:
|
||||
| for ent in doc.ents:
|
||||
| if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
|
||||
| counts[ent.orth_][ent.root.head.lemma_] += 1
|
||||
| return counts
|
||||
|
||||
//+example("Define custom NER rules")
//  pre.language-python: code
//    | nlp.matcher

+example("Calculate inline mark-up on original string")
|
||||
pre.language-python: code
|
||||
| def put_spans_around_tokens(doc, get_classes):
|
||||
| '''Given some function to compute class names, put each token in a
|
||||
| span element, with the appropriate classes computed.
|
||||
|
|
||||
| All whitespace is preserved, outside of the spans. (Yes, I know HTML
|
||||
| won't display it. But the point is no information is lost, so you can
|
||||
| calculate what you need, e.g. <br /> tags, <p> tags, etc.)
|
||||
| '''
|
||||
| output = []
|
||||
| template = '<span classes="{classes}">{word}</span>{space}'
|
||||
| for token in doc:
|
||||
| if token.is_space:
|
||||
| output.append(token.orth_)
|
||||
| else:
|
||||
| output.append(
|
||||
| template.format(
|
||||
| classes=' '.join(get_classes(token)),
|
||||
| word=token.orth_,
|
||||
| space=token.whitespace_))
|
||||
| string = ''.join(output)
|
||||
| string = string.replace('\n', '<br />')
|
||||
| string = string.replace('\t', ' '
|
||||
| return string
|
||||
|
||||
|
||||
+example("Efficient binary serialization")
|
||||
pre.language-python: code
|
||||
|
|
||||
| byte_string = doc.as_bytes()
|
||||
| open('/tmp/moby_dick.bin', 'wb').write(byte_string)
|
||||
|
|
||||
| nlp = spacy.en.English()
|
||||
| for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
|
||||
| doc = Doc(nlp.vocab)
|
||||
| doc.from_bytes(byte_string)
|
||||
|
||||
|
||||
p
  | See the
  a(href="docs.html") docs page
  | for
  a(href="docs.html#api") API documentation,
  a(href="docs.html#tutorials") tutorials,
  | and
  a(href="docs.html#spec") annotation specs.