Merge branch 'master' of ssh://github.com/honnibal/spaCy

This commit is contained in:
Matthew Honnibal 2015-12-01 18:11:10 +01:00
commit 0ee89504a7
24 changed files with 46 additions and 1347 deletions

View File

@ -6,6 +6,7 @@ os:
python:
- "2.7"
- "3.4"
- "3.5"
# install dependencies
install:

View File

@ -41,6 +41,7 @@ Supports
* CPython 2.7
* CPython 3.4
* CPython 3.5
* OSX
* Linux
* Cygwin

15
fabfile.py vendored
View File

@ -60,7 +60,7 @@ def prebuild(build_dir='/tmp/build_spacy'):
local('py.test --models spacy/tests/')
def docs():
def web():
def jade(source_name, out_dir):
pwd = path.join(path.dirname(__file__), 'website')
jade_loc = path.join(pwd, 'src', 'jade', source_name)
@ -68,18 +68,25 @@ def docs():
local('jade -P %s --out %s' % (jade_loc, out_loc))
with virtualenv(VENV_DIR):
local('./website/create_code_samples tests/website/ website/src/code/')
local('./website/create_code_samples spacy/tests/website/ website/src/code/')
jade('404.jade', '')
jade('home/index.jade', '')
jade('docs/index.jade', 'docs/')
jade('blog/index.jade', 'blog/')
jade('tutorials/index.jade', 'tutorials/')
for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / 'blog').iterdir():
for collection in ('blog', 'tutorials'):
for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / collection).iterdir():
if post_dir.is_dir() \
and (post_dir / 'index.jade').exists() \
and (post_dir / 'meta.jade').exists():
jade(str(post_dir / 'index.jade'), path.join('blogs', post_dir.parts[-1]))
jade(str(post_dir / 'index.jade'), path.join(collection, post_dir.parts[-1]))
def web_publish(assets_path):
local('aws s3 sync --delete --exclude "resources/*" website/site/ s3://spacy.io')
local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
def publish(version):

View File

@ -10,4 +10,4 @@ plac
six
ujson
cloudpickle
sputnik == 0.5.1
sputnik == 0.5.2

View File

@ -144,7 +144,7 @@ def cython_setup(mod_names, language, includes):
author='Matthew Honnibal',
author_email='honnibal@gmail.com',
version=VERSION,
url="http://honnibal.github.io/spaCy/",
url="http://spacy.io",
package_data=PACKAGE_DATA,
ext_modules=exts,
cmdclass={'build_ext': build_ext_cython_subclass},
@ -179,7 +179,7 @@ def run_setup(exts):
license="MIT",
install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
'ujson', 'cloudpickle', 'sputnik == 0.5.1'],
'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
setup_requires=["headers_workaround"],
cmdclass = {'build_ext': build_ext_subclass },
)

View File

@ -15,7 +15,7 @@ The Stack
--------
The site is built with the [Jade](http://jade-lang.com/) template language.
See [the Makefile](Makefile) for more
See [fabfile.py](/fabfile.py) under ```web()``` for more
Developing
@ -23,8 +23,7 @@ Developing
To make and test changes
```
npm install jade --global
cd website
make
python -m SimpleHTTPServer 8000
fab web
cd website/site; python -m SimpleHTTPServer 8000; cd -
```
Then visit [localhost:8000/src/...](http://localhost:8000/src/)
Then visit [localhost:8000](http://localhost:8000)

View File

@ -26,7 +26,7 @@ def main(src_dirname, dst_dirname):
continue
# Remove test_ prefix and .py suffix
name = filename[6:-3]
name = filename[5:-3]
with io.open(os.path.join(src_dirname, filename), 'r', encoding='utf8') as file_:
source = file_.readlines()
tree = ast.parse("".join(source))

View File

@ -1,67 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>404 | spaCy.io</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
<meta name="description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
<meta itemprop="name" content="404 | spaCy.io">
<meta itemprop="description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
<meta itemprop="image" content="http://spacy.io/resources/img/social.png">
<meta name="twitter:card" content="summary">
<meta name="twitter:site" content="spacy_io">
<meta name="twitter:title" content="404 | spaCy.io">
<meta name="twitter:description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
<meta name="twitter:creator" content="@spacy_io">
<meta name="twitter:image" content="http://spacy.io/resources/img/social_small.png">
<meta property="og:title" content="404 | spaCy.io">
<meta property="og:type" content="article">
<meta property="og:url" content="http://spacy.io/">
<meta property="og:image" content="http://spacy.io/resources/img/social.png">
<meta property="og:description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
<meta property="og:site_name" content="spaCy.io">
<meta property="article:published_time">
<link rel="stylesheet" href="/resources/css/style.css">
<!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
</head>
<body id="">
<header role="banner">
<h1 class="logo">spaCy.io</h1>
<div class="slogan">
</div>
</header>
<nav role="navigation">
<li><a href="/">Home</a></li>
<li><a href="/docs">Docs</a></li>
<li><a href="/license">License</a></li>
<li><a href="/blog">Blog</a></li>
</nav>
<main id="content">
</main>
<script src="/resources/js/prism.min.js"></script>
<!-- Details polyfill-->
<script>
var details = document.getElementsByTagName("details");
var summary = document.getElementsByTagName("summary");
for(var i = 0; i < details.length; i++) {
(details[i].getAttribute("open") == null) ? details[i].setAttribute("data-open", "false") : details[i].setAttribute("data-open", "true");
}
for(var i = 0; i < summary.length; i++) {
summary[i].addEventListener( "click", function(e) {
var parent = this.parentElement;
(parent.getAttribute("data-open") == "false") ? parent.setAttribute("data-open", "true") : parent.setAttribute("data-open", "false");
});
}
</script>
<!-- Google analytics-->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-58931649-1', 'auto');
ga('send', 'pageview');
</script>
<footer role="contentinfo"><span class="slogan copyright">&copy; 2015 Syllogism Co. | <a href="mailto:contact@spacy.io">Contact</a></span></footer>
</body>
</html>

View File

@ -1,97 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Natural Language Processing Software Badly Needs Some Deprecation Notices | spaCy.io</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
<meta name="description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
<meta itemprop="name" content="Natural Language Processing Software Badly Needs Some Deprecation Notices">
<meta itemprop="description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
<meta itemprop="image" content="http://spacy.io/resources/img/social.png">
<meta name="twitter:card" content="summary">
<meta name="twitter:site" content="spacy_io">
<meta name="twitter:title" content="Natural Language Processing Software Badly Needs Some Deprecation Notices">
<meta name="twitter:description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
<meta name="twitter:creator" content="@honnibal">
<meta name="twitter:image" content="http://spacy.io/resources/img/social_small.png">
<meta property="og:title" content="Natural Language Processing Software Badly Needs Some Deprecation Notices">
<meta property="og:type" content="article">
<meta property="og:url" content="http://spacy.io/blog/introducing-spacy">
<meta property="og:image" content="http://spacy.io/resources/img/social.png">
<meta property="og:description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
<meta property="og:site_name" content="spaCy.io">
<meta property="article:published_time" content="2015-02-19T00:00:00.000Z">
<link rel="stylesheet" href="/resources/css/style.css">
<!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
</head>
<body id="blog">
<header role="banner">
<h1 class="logo">spaCy.io</h1>
<div class="slogan">Blog
</div>
</header>
<nav role="navigation">
<li><a href="/">Home</a></li>
<li><a href="/docs">Docs</a></li>
<li><a href="/displacy" target="_blank">Demo</a></li>
<li><a href="/license">License</a></li>
<li class="active"><a href="/blog">Blog</a></li>
</nav>
<main id="content">
<article class="post">
<header>
<h2>
<strike></strike>
</h2>
<h2>Natural Language Processing Software Badly Needs Some Deprecation Notices</h2>
<div class="subhead">by <a href="//twitter.com/honnibal" rel="author" target="_blank">Matthew Honnibal</a> on
<time>February 19, 2015</time>
</div>
</header>
<p>Imagine: you try to use Google Translate, but it asks you to first select which model you want. The new, awesome deep-learning model is there, but so are lots of others. You pick one that sounds fancy, but it turns out it's a 20-year old experimental model trained on a corpus of oven manuals. You are not interested in oven manuals.</p>
<p>Of course, this is not how Google Translate operates. They make sure the model you use is good. This is what spaCy does, too. But for most natural language understanding libraries, it's just not anybody's job to delete obsolete models. There's also a real reluctance to editorialize. Some advantage can be found for every model. Like, is it really fair to call that oven-specific model obsolete? In some ways we still have a lot to learn from its principled approach. And what if someone needs to translate an oven manual?</p>
<p>Have a look through the <a href="http://gate.ac.uk/sale/tao/split.html">GATE software</a>. There's a lot there, developed over 12 years and many person-hours. But there's approximately zero curation. The philosophy is just to provide things. It's up to you to decide what to use.</p>
<p>This is bad. It's bad to provide an implementation of <a href="https://gate.ac.uk/sale/tao/splitch18.html">MiniPar</a>, and have it just...sit there, with no hint that it's 20 years old and should not be used. The RASP parser, too. Why are these provided? Worse, why is there no warning? Unless you want to investigate the history of the field, there's no reason to execute these programs in 2015.</p>
<p><a href="http://webdocs.cs.ualberta.ca/~lindek/minipar.htm">Check out</a> how <a href="http://research.google.com/pubs/author108.html">Dekang Lin</a>, the author of Minipar, presents the software &ndash; with reference to a benchmark on a Pentium II. This is the right way to archive the program. In this form its status is clear.</p>
<p>Various people have asked me why I decided to make a new Python NLP library, <a href="http://spacy.io">spaCy</a>, instead of supporting the <a href="http://nltk.org">NLTK</a> project. There are many things I dislike about the NLTK code-base, but the lack of curation is really my key complaint: the project simply doesn't throw anything away, and it refuses to call any technique or implementation good or bad. </p>
<p>In March NLTK announced the inclusion of a more up-to-date dependency parsing algorithm, based on the linear-time algorithm everyone is now using. There was some excitement about this, as this type of parser really should get much better accuracy than the other algorithms NLTK includes. But can you tell <a href="http://www.nltk.org/py-modindex.html">which of these parsers is the new one?</a></p>
<p>The best parser there &ndash; the new one &ndash; is called "transition parser". But it's still not actually good. Unfortunately, the NLTK implementation is based on Nivre's original 2003 paper, instead of using the recent research; and they use external, general-purpose machine learning libraries, instead of a simple custom implementation that would perform much better. Together these limitations mean the performance of the model is terrible, relative to the current state-of-the-art.</p>
<p>I happened to visit the NLTK issue tracker while they were discussing the transition-based parser, so I linked them to my post explaining how to implement this parser in 500 lines of Python. I got a "thanks but no thanks", and <a href="https://github.com/nltk/nltk/issues/694">the issue was abruptly closed</a>. Another researcher's offer from 2012 to implement this type of model also went <a href="http://arxiv.org/pdf/1409.7386v1.pdf">unanswered</a>.</p>
<p>An enormous amount of work has gone into, and is still going into, making NLTK an easily accessible way for computer science students to learn a little bit about linguistics, or for linguistics students to learn a little bit about computer science. I respect that work.</p>
<p>But nowhere does it say that if you want to really build something, or do up-to-date research, NLTK isn't for you. NLTK claims it can serve that use-case. But it can't. The implication is that if you use the models provided in NLTK, e.g. its chunker, tagger, dependency parser etc, these will be roughly equivalent to what you'll get elsewhere. But they're not. The gulf in quality is enormous. <a href="https://github.com/nltk/nltk/issues/1063">NLTK does not even know how its POS tagger was trained</a>. The model is just this .pickle file that's been passed around for 5 years, its origins lost to time. This is not okay. </p>
<p>I think open source software should be very careful to make its limitations clear. It's a disservice to provide something that's much less useful than you imply. It's like offering your friend a lift and then not showing up. It's totally fine to not do something &ndash; so long as you never suggested you were going to do it. There are ways to do worse than nothing. </p>
<footer role="contentinfo" class="meta"><a href="http://twitter.com/share?text=Natural Language Processing Software Badly Needs Some Deprecation Notices&amp;url=http://spacy.io/blog/introducing-spacy&amp;via=spacy_io" title="Share on Twitter" target="_blank" class="button button-twitter">Share on Twitter </a>
<div class="discuss"> <a target="_blank" href="https://www.reddit.com/r/programming/comments/2tlyrr/spacy_industrialstrength_nlp_with_pythoncython" title="Discuss on Reddit" class="button button-reddit">Reddit Thread</a> <a target="_blank" href="https://news.ycombinator.com/item?id=8942783" title="Discuss on Hacker News Thread" class="button button-hn">Hacker News</a>
</div>
<section class="intro profile">
<p><img src="/resources/img/matt.png"> Matthew Honnibal is the author of the <a href="http://spacy.io">spaCy</a> software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin. <span class="social"><a href="//twitter.com/honnibal" target="_blank">Twitter</a></span></p>
</section>
</footer>
</article>
</main>
<script src="/resources/js/prism.min.js"></script>
<!-- Details polyfill-->
<script>
var details = document.getElementsByTagName("details");
var summary = document.getElementsByTagName("summary");
for(var i = 0; i < details.length; i++) {
(details[i].getAttribute("open") == null) ? details[i].setAttribute("data-open", "false") : details[i].setAttribute("data-open", "true");
}
for(var i = 0; i < summary.length; i++) {
summary[i].addEventListener( "click", function(e) {
var parent = this.parentElement;
(parent.getAttribute("data-open") == "false") ? parent.setAttribute("data-open", "true") : parent.setAttribute("data-open", "false");
});
}
</script>
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-58931649-1', 'auto');
ga('send', 'pageview');
</script>
<footer role="contentinfo"><span class="slogan copyright">&copy; 2015 Syllogism Co. | <a href="mailto:contact@spacy.io">Contact</a></span></footer>
</body>
</html>

View File

@ -2,7 +2,7 @@ include ../../header.jade
include ./meta.jade
mixin Displacy(sentence, caption_text, height)
- var url = "/displacy/?full=" + sentence.replace(" ", "%20")
- var url = "http://api.spacy.io/displacy/?full=" + sentence.replace(" ", "%20")
.displacy
iframe.displacy(src="/resources/displacy/robots.html" height=height)
@ -20,7 +20,7 @@ mixin Displacy(sentence, caption_text, height)
p A syntactic dependency parse is a kind of shallow meaning representation. It's an important piece of many language understanding and text processing technologies. Now that these representations can be computed quickly, and with increasingly high accuracy, they're being used in lots of applications &ndash; translation, sentiment analysis, and summarization are major application areas.
p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="http://spaCy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="/displacy") displaCy]:
p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="http://spaCy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="http://api.spacy.io/displacy") displaCy]:
+Displacy("Robots in popular culture are there to remind us of the awesomeness of unbounded human agency", "Click the button to full-screen and interact, or scroll to see the full parse.", 325)
@ -40,7 +40,7 @@ mixin Displacy(sentence, caption_text, height)
p To me, this seemed like witchcraft, or a hack at best. But I was quickly won over: if all we do is declare the data and the relationships, in standards-compliant HTML and CSS, then we can simply step back and let the browser do its job. We know the code will be small, the layout will work on a variety of display, and we'll have a ready separation of style and content. For long output, we simply let the graphic overflow, and let users scroll.
p What I'm particularly excited about is the potential for displaCy as an #[a(href="http://spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
p What I'm particularly excited about is the potential for displaCy as an #[a(href="http://api.spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
ul
li You're always asked a question. You don't have to decide-what-to-decide.

View File

@ -10,7 +10,7 @@ include ./meta.jade
p It turns out that almost anything we say could mean many many different things, but we don't notice because almost all of those meanings would be weird or stupid or just not possible. If I say:
p.example #[a(href="http://spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
p.example #[a(href="http://api.spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
p Would you ever ask me,
@ -18,7 +18,7 @@ include ./meta.jade
p It's weird to even think of that. But a computer just might, because there are other cases like:
p.example #[a(href="http://spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
p.example #[a(href="http://api.spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
p Where the words hang together in the other way. People used to think that the answer was to tell the computer lots and lots of facts. But then you wake up one day and you're writing facts like #[em movies do not wear dresses], and you wonder where it all went wrong. Actually it's even worse than that. Not only are there too many facts, most of them are not even really facts! #[a(href="https://en.wikipedia.org/wiki/Cyc") People really tried this]. We've found that the world is made up of #[em if]s and #[em but]s.

View File

@ -98,7 +98,7 @@ include ./meta.jade
h3 Dependency Parser
p The parser uses the algorithm described in my #[a(href="parsing-english-in-python/") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
p The parser uses the algorithm described in my #[a(href="/blog/parsing-english-in-python/") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
p Some quick details about spaCy's take on this, for those who happen to know these models well. I'll write up a better description shortly.

View File

@ -19,4 +19,4 @@ include ./meta.jade
+TweetThis("Computers don't understand text. This is unfortunate, because that's what the web is mostly made of.", Meta.url)
p If none of that made any sense to you, here's the gist of it. Computers don't understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it.
p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can #[a(href="/license") buy a commercial license] under generous terms.
p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can buy a commercial license under generous terms (Note: #[a(href="/blog/spacy-now-mit/") spaCy is now licensed under MIT]).

View File

@ -1,535 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Parsing English in 500 lines of Python | spaCy.io</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
<meta name="description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
<meta itemprop="name" content="Parsing English in 500 lines of Python">
<meta itemprop="description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
<meta itemprop="image" content="http://spacy.io/resources/img/social.png">
<meta name="twitter:card" content="summary">
<meta name="twitter:site" content="spacy_io">
<meta name="twitter:title" content="Parsing English in 500 lines of Python">
<meta name="twitter:description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
<meta name="twitter:creator" content="@honnibal">
<meta name="twitter:image" content="http://spacy.io/resources/img/social_small.png">
<meta property="og:title" content="Parsing English in 500 lines of Python">
<meta property="og:type" content="article">
<meta property="og:url" content="http://spacy.io/blog/parsing-english-in-python">
<meta property="og:image" content="http://spacy.io/resources/img/social.png">
<meta property="og:description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
<meta property="og:site_name" content="spaCy.io">
<meta property="article:published_time" content="2013-12-18T00:00:00.000Z">
<link rel="stylesheet" href="/resources/css/style.css">
<!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
</head>
<body id="blog">
<header role="banner">
<h1 class="logo">spaCy.io</h1>
<div class="slogan">Blog
</div>
</header>
<nav role="navigation">
<li><a href="/">Home</a></li>
<li><a href="/docs">Docs</a></li>
<li><a href="/license">License</a></li>
<li class="active"><a href="/blog">Blog</a></li>
</nav>
<main id="content">
<article class="post">
<header>
<h2>Parsing English in 500 lines of Python</h2>
<div class="subhead">by <a href="//twitter.com/honnibal" rel="author" target="_blank">Matthew Honnibal</a> on
<time>December 18, 2013</time>
</div>
</header>
<p class="box infobox"><strong class="note">2015-08-19 Update:</strong> I wrote this blog post in 2013, describing an exciting advance in natural language understanding technology. Today, almost all high-performance parsers are using a variant of the algorithm described below (including spaCy). The original post is preserved below, with added commentary in light of recent research.</p>
<p>A <a href="http://googleresearch.blogspot.de/2013/05/syntactic-ngrams-over-time.html">syntactic parser</a> describes a sentence's grammatical structure, to help another application reason about it. Natural languages introduce many unexpected ambiguities, which our world-knowledge immediately filters out. A favourite example:</p>
<p class="example">They ate the pizza with anchovies</p>
<p><img src="/resources/img/anchovies.png" alt="Eat-with pizza-with ambiguity"></p>
<p>A correct parse links “with” to “pizza”, while an incorrect parse links “with” to “eat”:</p>
<div class="displacy">
<iframe src="/resources/displacy/anchovies_bad.html" height="275"></iframe>
</div>
<div class="displacy">
<iframe src="/resources/displacy/anchovies_good.html" height="275" class="displacy"></iframe>
<p class="caption">Prepositional phrase attachment is a common source of errors for statistical parsers.</p>
</div>
<p>The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It's now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.</p>
<p class="box infobox"><strong class="note">Update:</strong> CoreNLP now features high-performance transition-based models. It is much faster than the Redshift parser (my research system), but less accurate. spaCy is faster again still, more accurate than CoreNLP, but less accurate than Redshift, due to spaCy's use of greedy search. It would be relatively easy to provide a beam-search version of spaCy...But, I think the gap in accuracy will continue to close, especially given advances in neural network learning.</p>
<table>
<thead>
<tr>
<th>Parser</th>
<th>Accuracy</th>
<th>Speed (w/s)</th>
<th>Language</th>
<th>LOC</th>
</tr>
</thead>
<tbody>
<tr>
<td>Stanford</td>
<td>89.6%</td>
<td>19</td>
<td>Java</td>
<td>&gt; 4,000 <sup><a href="#note-1">[1]</a></sup></td>
</tr>
<tr>
<td><strong>parser.py</strong></td>
<td>89.8%</td>
<td>2,020</td>
<td>Python</td>
<td><strong>~500</strong></td>
</tr>
<tr>
<td>Redshift</td>
<td><strong>93.6%</strong></td>
<td><strong>2,580</strong></td>
<td>Cython</td>
<td>~4,000</td>
</tr>
</tbody>
</table>
<p>The rest of the post sets up the problem, and then takes you through <a href="https://gist.github.com/syllog1sm/10343947">a concise implementation</a>, prepared for this post. The first 200 lines of parser.py, the part-of-speech tagger and learner, are described <a href="#">here</a>. You should probably at least skim that post before reading this one, unless youre very familiar with NLP research.</p>
<p>The Cython system, Redshift, was written for my current research. I plan to improve it for general use in June, after my contract ends at Macquarie University. The current version is <a href="http://github.com/syllog1sm/redshift">hosted on GitHub</a>.</p>
<h3>Problem Description</h3>
<p>It'd be nice to type an instruction like this into your phone:</p>
<p class="example">Set volume to zero when I'm in a meeting, unless John's school calls.</p>
<p>And have it set the appropriate policy. On Android you can do this sort of thing with <a href="https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm">Tasker</a>, but an NL interface would be much better. Itd be especially nice to receive a meaning representation you could edit, so you could see what it thinks you said, and correct it.</p>
<p>There are lots of problems to solve to make that work, but some sort of syntactic representation is definitely necessary. We need to know that:</p>
<p class="example">Unless Johns school calls, when Im in a meeting, set volume to zero</p>
<p>is another way of phrasing the first instruction, while:</p>
<p class="example">Unless Johns school, call when Im in a meeting</p>
<p>means something completely different.</p>
<p>A dependency parser returns a graph of word-word relationships, intended to make such reasoning easier. Our graphs will be trees &ndash; edges will be directed, and every node (word) will have exactly one incoming arc (one dependency, with its head), except one.</p>
<h4>Example usage</h4>
<pre class="language-python"><code>parser = parser.Parser()
tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split()
&gt;&gt;&gt; tags, heads = parser.parse(tokens)
&gt;&gt;&gt; heads
[-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11]
&gt;&gt;&gt; for i, h in enumerate(heads):
...   head = tokens[h] if h &gt;= 0 else 'None'
...   print(tokens[i] + ' &lt;-- ' + head)
Set &lt;-- None
the &lt;-- volume
volume &lt;-- Set
to &lt;-- Set
zero &lt;-- to
when &lt;-- Set
I &lt;-- 'm
'm &lt;-- when
in &lt;-- 'm
a &lt;-- meeting
meeting &lt;-- in
unless &lt;-- Set
John &lt;-- 's
's &lt;-- calls
school &lt;-- calls
calls &lt;-- unless</code></pre>
<p>The idea is that it should be slightly easier to reason from the parse, than it was from the string. The parse-to-meaning mapping is hopefully simpler than the string-to-meaning mapping.</p>
<p>The most confusing thing about this problem area is that “correctness” is defined by convention — by annotation guidelines. If you havent read the guidelines and youre not a linguist, you cant tell whether the parse is “wrong” or “right”, which makes the whole task feel weird and artificial.</p>
<p>For instance, theres a mistake in the parse above: “Johns school calls” is structured wrongly, according to the Stanford annotation guidelines. The structure of that part of the sentence is how the annotators were instructed to parse an example like “Johns school clothes”.</p>
<p>Its worth dwelling on this point a bit. We could, in theory, have written our guidelines so that the “correct” parses were reversed. Theres good reason to believe the parsing task will be harder if we reversed our convention, as itd be less consistent with the rest of the grammar. <sup><a href="#note-2">[2]</a></sup> But we could test that empirically, and wed be pleased to gain an advantage by reversing the policy.</p>
<p>We definitely do want that distinction in the guidelines — we dont want both to receive the same structure, or our output will be less useful. The annotation guidelines strike a balance between what distinctions downstream applications will find useful, and what parsers will be able to predict easily.</p>
<h4>Projective trees</h4>
<p>Theres a particularly useful simplification that we can make, when deciding what we want the graph to look like: we can restrict the graph structures well be dealing with. This doesnt just give us a likely advantage in learnability; it can have deep algorithmic implications. We follow most work on English in constraining the dependency graphs to be <em>projective trees</em>:</p>
<ol>
<li>Tree. Every word has exactly one head, except for the dummy ROOT symbol.</li>
  <li>Projective. For every pair of dependencies (a1, a2) and (b1, b2), if a1 &lt; b2, then a2 &gt;= b2. In other words, dependencies cannot “cross”. You cant have a pair of dependencies that goes a1 b1 a2 b2, or b1 a1 b2 a2.</li>
</ol>
<p>Theres a rich literature on parsing non-projective trees, and a smaller literature on parsing DAGs. But the parsing algorithm Ill be explaining deals with projective trees.</p>
<h3>Greedy transition-based parsing</h3>
<p>Our parser takes as input a list of string tokens, and outputs a list of head indices, representing edges in the graph. If the <em>i</em>th member of heads is <em>j</em>, the dependency parse contains an edge (j, i). A transition-based parser is a finite-state transducer; it maps an array of N words onto an output array of N head indices:</p>
<table class="center">
<tbody>
<tr>
<td><em>start</em></td>
<td>MSNBC</td>
<td>reported</td>
<td>that</td>
<td>Facebook</td>
<td>bought</td>
<td>WhatsApp</td>
<td>for</td>
<td>$16bn</td>
<td><em>root</em></td>
</tr>
<tr>
<td>0</td>
<td>2</td>
<td>9</td>
<td>2</td>
<td>4</td>
<td>2</td>
<td>4</td>
<td>4</td>
<td>7</td>
<td>0</td>
</tr>
</tbody>
</table>
<p>The heads array denotes that the head of <em>MSNBC</em> is <em>reported</em>:
  <em>MSNBC</em> is word 1, and <em>reported</em> is word 2, and <code class="language-python">heads[1] == 2</code>. You can already see why parsing a tree is handy — this data structure wouldnt work if we had to output a DAG, where words may have multiple heads.
</p>
<p>Although <code class="language-python">heads</code> can be represented as an array, wed actually like to maintain some alternate ways to access the parse, to make it easy and efficient to extract features. Our <code class="language-python">Parse</code> class looks like this:</p>
<pre class="language-python"><code>class Parse(object):
def __init__(self, n):
self.n = n
self.heads = [None] * (n-1)
self.lefts = []
self.rights = []
for i in range(n+1):
self.lefts.append(DefaultList(0))
self.rights.append(DefaultList(0))
def add_arc(self, head, child):
self.heads[child] = head
if child &lt; head:
self.lefts[head].append(child)
else:
self.rights[head].append(child)</code></pre>
<p>As well as the parse, we also have to keep track of where were up to in the sentence. Well do this with an index into the <code class="language-python">words</code> array, and a stack, to which well push words, before popping them once their head is set. So our state data structure is fundamentally:</p>
<ul>
<li>An index, i, into the list of tokens;</li>
<li>The dependencies added so far, in Parse</li>
<li>A stack, containing words that occurred before i, for which were yet to assign a head.</li>
</ul>
<p>Each step of the parsing process applies one of three actions to the state:</p>
<pre class="language-python"><code>SHIFT = 0; RIGHT = 1; LEFT = 2
MOVES = [SHIFT, RIGHT, LEFT]
def transition(move, i, stack, parse):
global SHIFT, RIGHT, LEFT
if move == SHIFT:
stack.append(i)
return i + 1
elif move == RIGHT:
parse.add_arc(stack[-2], stack.pop())
return i
elif move == LEFT:
parse.add_arc(i, stack.pop())
return i
raise GrammarError(&quot;Unknown move: %d&quot; % move)</code></pre>
<p>The <code class="language-python">LEFT</code> and <code class="language-python">RIGHT</code> actions add dependencies and pop the stack, while <code class="language-python">SHIFT</code> pushes the stack and advances i into the buffer.</p>
<p>So, the parser starts with an empty stack, and a buffer index at 0, with no dependencies recorded. It chooses one of the (valid) actions, and applies it to the state. It continues choosing actions and applying them until the stack is empty and the buffer index is at the end of the input. (Its hard to understand this sort of algorithm without stepping through it. Try coming up with a sentence, drawing a projective parse tree over it, and then try to reach the parse tree by choosing the right sequence of transitions.)</p>
<p>Heres what the parsing loop looks like in code:</p>
<pre class="language-python"><code>class Parser(object):
...
def parse(self, words):
tags = self.tagger(words)
n = len(words)
idx = 1
stack = [0]
        parse = Parse(n)
        while stack or idx &lt; n:
            features = extract_features(words, tags, idx, n, stack, parse)
            scores = self.model.score(features)
            valid_moves = get_valid_moves(idx, n, len(stack))
next_move = max(valid_moves, key=lambda move: scores[move])
idx = transition(next_move, idx, stack, parse)
return tags, parse
def get_valid_moves(i, n, stack_depth):
moves = []
if i &lt; n:
moves.append(SHIFT)
    if stack_depth &gt;= 2:
        moves.append(RIGHT)
    if stack_depth &gt;= 1:
moves.append(LEFT)
return moves</code></pre>
<p>We start by tagging the sentence, and initializing the state. We then map the state to a set of features, which we score using a linear model. We then find the best-scoring valid move, and apply it to the state.</p>
<p>The model scoring works the same as it did in <a href="#">the POS tagger</a>. If youre confused about the idea of extracting features and scoring them with a linear model, you should review that post. Heres a reminder of how the model scoring works:</p>
<pre class="language-python"><code>class Perceptron(object):
...
def score(self, features):
all_weights = self.weights
scores = dict((clas, 0) for clas in self.classes)
for feat, value in features.items():
if value == 0:
continue
if feat not in all_weights:
continue
weights = all_weights[feat]
for clas, weight in weights.items():
scores[clas] += value * weight
return scores</code></pre>
<p>Its just summing the class-weights for each feature. This is often expressed as a dot-product, but when youre dealing with multiple classes, that gets awkward, I find.</p>
<p>The beam parser (RedShift) tracks multiple candidates, and only decides on the best one at the very end. Were going to trade away accuracy in favour of efficiency and simplicity. Well only follow a single analysis. Our search strategy will be entirely greedy, as it was with the POS tagger. Well lock-in our choices at every step.</p>
<p>If you read the POS tagger post carefully, you might see the underlying similarity. What weve done is mapped the parsing problem onto a sequence-labelling problem, which we address using a “flat”, or unstructured, learning algorithm (by doing greedy search).</p>
<h3>Features</h3>
<p>Feature extraction code is always pretty ugly. The features for the parser refer to a few tokens from the context:</p>
<ul>
<li>The first three words of the buffer (n0, n1, n2)</li>
<li>The top three words of the stack (s0, s1, s2)</li>
<li>The two leftmost children of s0 (s0b1, s0b2);</li>
<li>The two rightmost children of s0 (s0f1, s0f2);</li>
<li>The two leftmost children of n0 (n0b1, n0b2)</li>
</ul>
<p>For these 12 tokens, we refer to the word-form, the part-of-speech tag, and the number of left and right children attached to the token.</p>
<p>Because were using a linear model, we have our features refer to pairs and triples of these atomic properties.</p>
<pre class="language-python"><code>def extract_features(words, tags, n0, n, stack, parse):
def get_stack_context(depth, stack, data):
if depth &gt;= 3:
return data[stack[-1]], data[stack[-2]], data[stack[-3]]
elif depth &gt;= 2:
return data[stack[-1]], data[stack[-2]], ''
elif depth == 1:
return data[stack[-1]], '', ''
else:
return '', '', ''
def get_buffer_context(i, n, data):
if i + 1 &gt;= n:
return data[i], '', ''
elif i + 2 &gt;= n:
return data[i], data[i + 1], ''
else:
return data[i], data[i + 1], data[i + 2]
def get_parse_context(word, deps, data):
if word == -1:
return 0, '', ''
deps = deps[word]
valency = len(deps)
if not valency:
return 0, '', ''
elif valency == 1:
return 1, data[deps[-1]], ''
else:
return valency, data[deps[-1]], data[deps[-2]]
features = {}
# Set up the context pieces --- the word, W, and tag, T, of:
# S0-2: Top three words on the stack
# N0-2: First three words of the buffer
# n0b1, n0b2: Two leftmost children of the first word of the buffer
# s0b1, s0b2: Two leftmost children of the top word of the stack
# s0f1, s0f2: Two rightmost children of the top word of the stack
depth = len(stack)
s0 = stack[-1] if depth else -1
Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
_, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
_, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
_, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
# Cap numeric features at 5?
# String-distance
Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
features['bias'] = 1
# Add word and tag unigrams
for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
if w:
features['w=%s' % w] = 1
for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
if t:
features['t=%s' % t] = 1
# Add word/tag pairs
for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
if w or t:
features['%d w=%s, t=%s' % (i, w, t)] = 1
# Add some bigrams
features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
# Add some tag trigrams
trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
(Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
(Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
(Ts0, Ts1, Ts1))
for i, (t1, t2, t3) in enumerate(trigrams):
if t1 or t2 or t3:
features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
# Add some valency and distance features
vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
for i, (w_t, v_d) in enumerate(vw + vt + d):
if w_t or v_d:
features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
return features</code></pre>
<h3>Training</h3>
<p>Weights are learned using the same algorithm, averaged perceptron, that we used for part-of-speech tagging. Its key strength is that its an online learning algorithm: examples stream in one-by-one, we make our prediction, check the actual answer, and adjust our beliefs (weights) if we were wrong.</p>
<p>The training loop looks like this:</p>
<pre class="language-python"><code>class Parser(object):
...
def train_one(self, itn, words, gold_tags, gold_heads):
n = len(words)
i = 2; stack = [1]; parse = Parse(n)
tags = self.tagger.tag(words)
while stack or (i + 1) &lt; n:
features = extract_features(words, tags, i, n, stack, parse)
scores = self.model.score(features)
valid_moves = get_valid_moves(i, n, len(stack))
guess = max(valid_moves, key=lambda move: scores[move])
gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
best = max(gold_moves, key=lambda move: scores[move])
self.model.update(best, guess, features)
i = transition(guess, i, stack, parse)
# Return number correct
return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])</code></pre>
<p>The most interesting part of the training process is in <code class="language-python">get_gold_moves</code>. The performance of our parser is made possible by an advance by Goldberg and Nivre (2012), who showed that wed been doing this wrong for years.</p>
<p class="box infobox"><strong class="note">2015-08-19 Update:</strong> Interesting, CoreNLP continues to "do it wrong" &ndash; their transition-based parser uses the static-oracle, rather than the dynamic oracle described here. I attribute spaCy's accuracy advantage to this difference in training algorithm. The ClearNLP parser uses an iterative algorithm that achieves the same sort of thing (and was published prior to the dynamic oracle). I find the dynamic oracle idea much more conceptually clear.</p>
<p>In the POS-tagging post, I cautioned that during training you need to make sure you pass in the last two <em>predicted</em> tags as features for the current tag, not the last two <em>gold</em> tags. At test time youll only have the predicted tags, so if you base your features on the gold sequence during training, your training contexts wont resemble your test-time contexts, so youll learn the wrong weights.</p>
<p>In parsing, the problem was that we didnt know <em>how</em> to pass in the predicted sequence! Training worked by taking the gold-standard tree, and finding a transition sequence that led to it. i.e., you got back a sequence of moves, with the guarantee that if you followed those moves, youd get the gold-standard dependencies.</p>
<p>The problem is, we didnt know how to define the “correct” move to teach a parser to make if it was in any state that <em>wasnt</em> along that gold-standard sequence. Once the parser had made a mistake, we didnt know how to train from that example.</p>
<p>That was a big problem, because it meant that once the parser started making mistakes, it would end up in states unlike any in its training data &ndash; leading to yet more mistakes. The problem was specific to greedy parsers: once you use a beam, theres a natural way to do structured prediction.</p>
<p class="box infobox"><strong class="note">2015-08-19 Update:</strong> It's since been pointed out to me that what we're calling a "dynamic oracle" here is really a form of <a href="http://www.ausy.tu-darmstadt.de/Research/ICML2011">imitation learning</a>.</p>
<p>The solution seems obvious once you know it, like all the best breakthroughs. What we do is define a function that asks “How many gold-standard dependencies can be recovered from this state?”. If you can define that function, then you can apply each move in turn, and ask, “How many gold-standard dependencies can be recovered from <em>this</em> state?”. If the action you applied allows <em>fewer</em> gold-standard dependencies to be reached, then it is sub-optimal.</p>
<p>Thats a lot to take in.</p>
<p>So we have this function <code>Oracle(state)</code>:
<pre><code>Oracle(state) = | gold_arcs ∩ reachable_arcs(state) |</code></pre>
</p>
<p>We also have a set of actions, each of which returns a new state. We want to know:</p>
<ul>
<li><code>shift_cost = Oracle(state) Oracle(shift(state))</code></li>
<li><code>right_cost = Oracle(state) Oracle(right(state))</code></li>
<li><code>left_cost = Oracle(state) Oracle(left(state))</code></li>
</ul>
<p>Now, at least one of those costs <em>has</em> to be zero. Oracle(state) is asking, “whats the cost of the best path forward?”, and the first action of that best path has to be shift, right, or left.</p>
<p>It turns out that we can derive Oracle fairly simply for many transition systems. The derivation for the transition system were using, Arc Hybrid, is in Goldberg and Nivre (2013).</p>
<p>Were going to implement the oracle as a function that returns the zero-cost moves, rather than implementing a function Oracle(state). This prevents us from doing a bunch of costly copy operations. Hopefully the reasoning in the code isnt too hard to follow, but you can also consult Goldberg and Nivres papers if youre confused and want to get to the bottom of this.</p>
<pre class="language-python"><code>def get_gold_moves(n0, n, stack, heads, gold):
def deps_between(target, others, gold):
for word in others:
if gold[word] == target or gold[target] == word:
return True
return False
valid = get_valid_moves(n0, n, len(stack))
if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
return [SHIFT]
if gold[stack[-1]] == n0:
return [LEFT]
costly = set([m for m in MOVES if m not in valid])
# If the word behind s0 is its gold head, Left is incorrect
if len(stack) &gt;= 2 and gold[stack[-1]] == stack[-2]:
costly.add(LEFT)
# If there are any dependencies between n0 and the stack,
# pushing n0 will lose them.
if SHIFT not in costly and deps_between(n0, stack, gold):
costly.add(SHIFT)
# If there are any dependencies between s0 and the buffer, popping
# s0 will lose them.
if deps_between(stack[-1], range(n0+1, n-1), gold):
costly.add(LEFT)
costly.add(RIGHT)
return [m for m in MOVES if m not in costly]</code></pre>
<p>Doing this “dynamic oracle” training procedure makes a big difference to accuracy — typically 1-2%, with no difference to the way the run-time works. The old “static oracle” greedy training procedure is fully obsolete; theres no reason to do it that way any more.</p>
<h3>Conclusion</h3>
<p>I have the sense that language technologies, particularly those relating to grammar, are particularly mysterious. I can imagine having no idea what the program might even do.</p>
<p>I think it therefore seems natural to people that the best solutions would be over-whelmingly complicated. A 200,000 line Java package feels appropriate.</p>
<p>But, algorithmic code is usually short, when only a single algorithm is implemented. And when you only implement one algorithm, and you know exactly what you want to write before you write a line, you also dont pay for any unnecessary abstractions, which can have a big performance impact.</p>
<h3>Notes</h3>
<p><a name="note-1"></a> [1] I wasnt really sure how to count the lines of code in the Stanford parser. Its jar file ships over 200k, but there are a lot of different models in it. Its not important, but it's certainly over 4k.</p>
<p><a name="note-2"></a> [2] For instance, how would you parse, “Johns school of music calls”? You want to make sure the phrase “Johns school” has a consistent structure in both “Johns school calls” and “Johns school of music calls”. Reasoning about the different “slots” you can put a phrase into is a key way we reason about what syntactic analyses look like. You can think of each phrase as having a different shaped connector, which you need to plug into different slots — which each phrase also has a certain number of, each of a different shape. Were trying to figure out what connectors are where, so we can figure out how the sentences are put together.</p>
<h3>Idle speculation</h3>
<p>For a long time, incremental language processing algorithms were primarily of scientific interest. If you want to write a parser to test a theory about how the human sentence processor might work, well, that parser needs to build partial interpretations. Theres a wealth of evidence, including commonsense introspection, that establishes that we dont buffer input and analyse it once the speaker has finished.</p>
<p>But now algorithms with that neat scientific feature are winning! As best as I can tell, the secret to that success is to be:</p>
<ul>
<li>Incremental. Earlier words constrain the search.</li>
<li>Error-driven. Training involves a working hypothesis, which is updated as it makes mistakes.</li>
</ul>
<p>The links to human sentence processing seem tantalising. I look forward to seeing whether these engineering breakthroughs lead to any psycholinguistic advances.</p>
<h3>Bibliography</h3>
<p>The NLP literature is almost entirely open access. All of the relevant papers can be found <a href="http://aclweb.org/anthology/" rel="nofollow">here</a>.</p>
<p>The parser Ive described is an implementation of the dynamic-oracle Arc-Hybrid system here:<span class="bib-item">Goldberg, Yoav; Nivre, Joakim. <em>Training Deterministic Parsers with Non-Deterministic Oracles</em>. TACL 2013</span></p>
<p>However, I wrote my own features for it. The arc-hybrid system was originally described here:<span class="bib-item">Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic programming algorithms for transition-based dependency parsers. ACL 2011</span></p>
<p>The dynamic oracle training method was first described here:<span class="bib-item">A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; Nivre, Joakim. COLING 2012</span></p>
<p>This work depended on a big break-through in accuracy for transition-based parsers, when beam-search was properly explored by Zhang and Clark. They have several papers, but the preferred citation is:<span class="bib-item">Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized Perceptron and Beam Search. Computational Linguistics 2011 (1)</span></p>
<p>Another important paper was this little feature engineering paper, which further improved the accuracy:<span class="bib-item">Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with Rich Non-local Features. ACL 2011</span></p>
<p>The generalised perceptron, which is the learning framework for these beam parsers, is from this paper:<span class="bib-item">Collins, Michael. Discriminative Training Methods for Hidden Markov Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002</span></p>
<h3>Experimental details</h3>
<p>The results at the start of the post refer to Section 22 of the Wall Street Journal corpus. The Stanford parser was run as follows:</p>
<pre class="language-bash"><code>java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \
-outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $*</code></pre>
<p>A small post-process was applied, to undo the fancy tokenisation Stanford adds for numbers, to make them match the PTB tokenisation:</p>
<pre class="language-python"><code>"""Stanford parser retokenises numbers. Split them."""
import sys
import re
qp_re = re.compile('\xc2\xa0')
for line in sys.stdin:
line = line.rstrip()
if qp_re.search(line):
line = line.replace('(CD', '(QP (CD', 1) + ')'
line = line.replace('\xc2\xa0', ') (CD ')
print line</code></pre>
<p>The resulting PTB-format files were then converted into dependencies using the Stanford converter:</p>
<pre class="language-bash"><code>./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp
./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/
./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll</code></pre>
<p>I cant easily read that anymore, but it should just convert every .mrg file in a folder to a CoNLL-format Stanford basic dependencies file, using the settings common in the dependency literature.</p>
<p>I then converted the gold-standard trees from WSJ 22, for the evaluation. Accuracy scores refer to unlabelled attachment score (i.e. the head index) of all non-punctuation tokens.</p>
<p>To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 into the same conversion script.</p>
<p>In a nutshell: The Stanford model and parser.py are trained on the same set of sentences, and they each make their predictions on a held-out test set, for which we know the answers. Accuracy refers to how many of the words heads we got correct.</p>
<p>Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a server, to give the Stanford parser more memory. The parser.py system runs fine on my MacBook Air. I used PyPy for the parser.py experiments; CPython was about half as fast on an early benchmark.</p>
<p>One of the reasons parser.py is so fast is that it does unlabelled parsing. Based on previous experiments, a labelled parser would likely be about 40x slower, and about 1% more accurate. Adapting the program to labelled parsing would be a good exercise for the reader, if you have access to the data.</p>
<p>The result from the Redshift parser was produced from commit <code class="language-python">b6b624c9900f3bf</code>, which was run as follows:</p>
<pre class="language-bash"><code>./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp
./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/
./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll</code></pre>
<footer role="contentinfo" class="meta"><a href="http://twitter.com/share?text=Parsing English in 500 lines of Python&amp;url=http://spacy.io/blog/parsing-english-in-python&amp;via=spacy_io" title="Share on Twitter" target="_blank" class="button button-twitter">Share on Twitter </a>
<div class="discuss"> <a href="https://www.reddit.com/r/programming/comments/245jte/parsing_english_with_500_lines_of_python/" title="Discuss on Reddit" class="button button-reddit">Reddit Thread</a> <a href="https://news.ycombinator.com/item?id=7658864" title="Discuss on Hacker News Thread" class="button button-hn">Hacker News</a>
</div>
<section class="intro profile">
<p><img src="/resources/img/matt.png"> Matthew Honnibal is the author of the spaCy software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin. <span class="social"><a href="//twitter.com/honnibal" target="_blank">Twitter</a></span></p>
</section>
</footer>
</article>
</main>
<script src="/resources/js/prism.min.js"></script>
<!-- Details polyfill-->
<script>
// Polyfill: mirror each <details> element's open state in a data-open
// attribute, and flip that attribute whenever its <summary> is clicked.
var allDetails = document.getElementsByTagName("details");
var allSummaries = document.getElementsByTagName("summary");
for (var d = 0; d < allDetails.length; d++) {
  var initiallyOpen = allDetails[d].getAttribute("open") != null;
  allDetails[d].setAttribute("data-open", initiallyOpen ? "true" : "false");
}
for (var s = 0; s < allSummaries.length; s++) {
  allSummaries[s].addEventListener("click", function(e) {
    // Toggle the tracked state on the enclosing <details> element.
    var container = this.parentElement;
    var wasClosed = container.getAttribute("data-open") == "false";
    container.setAttribute("data-open", wasClosed ? "true" : "false");
  });
}
</script>
<script>
// Standard Google Analytics bootstrap snippet: sets up the ga() command
// queue, asynchronously loads analytics.js, then registers the property
// and records a pageview. Kept byte-identical to Google's reference code.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-58931649-1', 'auto');
ga('send', 'pageview');
</script>
<footer role="contentinfo"><span class="slogan copyright">&copy; 2015 Syllogism Co. | <a href="mailto:contact@spacy.io">Contact</a></span></footer>
</body>
</html>

View File

@ -1,606 +0,0 @@
<!-- TODO-->
<!-- Doc-->
<!-- to_array-->
<!-- count_by-->
<!-- from_array-->
<!-- from_bytes-->
<!-- to_bytes-->
<!-- read_bytes-->
<!-- -->
<!-- Token-->
<!-- Constructors-->
<!-- Examples for repvec. Rename?-->
<!-- Link Simple Good Turing in prob-->
<!---->
<!-- Span-->
<!-- Constructors-->
<!-- Convert details to Define lists-->
<!-- Styling of elements in Parse. Improve Span.root documentation-->
<!-- -->
<!-- Lexeme-->
<!-- Constructors-->
<!---->
<!-- Vocab-->
<!-- Constructors-->
<!---->
<!-- StringStore-->
<!-- Constructors-->
<details>
<summary><a name="pipeline"><span class="declaration"><span class="label">class</span><code>English</code></span></a></summary>
<p>Load models into a callable object to process English text. Intended use is for one instance to be created per process. You can create more if you're doing something unusual. You may wish to make the instance a global variable or "singleton". We usually instantiate the object in the <code>main()</code> function and pass it around as an explicit argument. </p>
<pre class="language-python"><code>from spacy.en import English
from spacy._doc_examples import download_war_and_peace
unprocessed_unicode = download_war_and_peace()
nlp = English()
doc = nlp(unprocessed_unicode)</code></pre>
<details open="open">
<summary><a><span class="declaration"><code>__init__</code><span class="parameters">self, data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True</span></span></a></summary>
<p>Load the resources. Loading takes 20 seconds, and the instance consumes 2 to 3 gigabytes of memory.</p>
<p>Load data from default directory:</p>
<pre class="language-python"><code>>>> nlp = English()
>>> nlp = English(data_dir=u'')</code></pre>
<p>Load data from specified directory:</p>
<pre class="language-python"><code>>>> nlp = English(data_dir=u'path/to/data_directory')</code></pre>
<p>Disable (and avoid loading) parts of the processing pipeline:</p>
<pre class="language-python"><code>>>> nlp = English(load_vectors=False, Parser=False, Tagger=False, Entity=False)</code></pre>
<p>Start with nothing loaded:</p>
<pre class="language-python"><code>>>> nlp = English(data_dir=None)</code></pre>
<ul>
<li><strong>data_dir</strong> &#8211;
The data directory. May be <code>None</code>, to disable any data loading (including the vocabulary).
</li>
<li><strong>Tagger</strong> &#8211; A class/function that creates the part-of-speech tagger. Usually this is left <code>True</code>, to load the default tagger. If falsey, no tagger is loaded.
<p>You can also supply your own class/function, which will be called once on setup. The returned function will then be called in <code>English.__call__</code>. The function passed must accept two arguments, of types <code>(StringStore, directory)</code>, and produce a function that accepts one argument, of type <code>Doc</code>. Its return type is unimportant.</p>
</li>
<li><strong>Parser</strong> &#8211; A class/function that creates the syntactic dependency parser. Usually this is left <code>True</code>, to load the default tagger. If falsey, no parser is loaded.
<p>You can also supply your own class/function, which will be called once on setup. The returned function will then be called in <code>English.__call__</code>. The function passed must accept two arguments, of types <code>(StringStore, directory)</code>, and produce a function that accepts one argument, of type <code>Doc</code>. Its return type is unimportant.</p>
</li>
<li><strong>Entity</strong> &#8211; A class/function that creates the named entity recogniser. Usually this is left <code>True</code>, to load the default tagger. If falsey, no entity recognizer is loaded.
<p>You can also supply your own class/function, which will be called once on setup. The returned function will then be called in <code>English.__call__</code>. The function passed must accept two arguments, of types <code>(StringStore, directory)</code>, and produce a function that accepts one argument, of type <code>Doc</code>. Its return type is unimportant.</p>
</li>
<li><strong>load_vectors</strong> &#8211;
A boolean value to control whether the word vectors are loaded.
</li>
</ul>
</details>
<details open="true">
<summary><a name="English-__call__"><span class="declaration"><code>__call__</code><span class="parameters">text, tag=True, parse=True, entity=True</span></span></a></summary>
<p>The main entry point to spaCy. Takes raw unicode text, and returns a <code>Doc</code> object, which can be iterated to access <code>Token</code> and <code>Span</code> objects. spaCy's models are all linear-time, so you can supply documents of arbitrary length, e.g. whole novels.</p>
<ul>
<li><strong>text</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) &#8211;The text to be processed. spaCy expects raw unicode txt &ndash; you don't necessarily need to, say, split it into paragraphs. However, depending on your documents, you might be better off applying custom pre-processing. Non-text formatting, e.g. from HTML mark-up, should be removed before sending the document to spaCy. If your documents have a consistent format, you may be able to improve accuracy by pre-processing. For instance, if the first word of your documents are always in upper-case, it may be helpful to normalize them before supplying them to spaCy.
</li>
<li><strong>tag</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>) &#8211;Whether to apply the part-of-speech tagger. Required for parsing and entity recognition.
</li>
<li><strong>parse</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>) &#8211; Whether to apply the syntactic dependency parser.
</li>
<li><strong>entity</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>) &#8211;Whether to apply the named entity recognizer.
</li>
</ul>
<pre class="language-python"><code>from spacy.en import English
nlp = English()
doc = nlp(u'Some text.') # Applies tagger, parser, entity
doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
doc = nlp(u'') # Zero-length tokens, not an error
# doc = nlp(b'Some text') <-- Error: need unicode
doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.</code></pre>
</details>
</details>
<details>
<summary><a name="doc"><span class="declaration"><span class="label">class</span><code>Doc</code></span></a></summary>
<p>A sequence of <code>Token</code> objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary strings.</p>
<p>Internally, the <code>Doc</code> object holds an array of <code>TokenC</code> structs. The Python-level <code>Token</code> and <code>Span</code> objects are views of this array, i.e. they don't own the data themselves. This details of the internals shouldn't matter for the API &ndash; but it may help you read the code, and understand how spaCy is designed.</p>
<details>
<summary>
<h4>Constructors</h4>
</summary><a href="#English-__call__"><span class="declaration"><span class="label">via</span><code>English.__call__(unicode text)</code></span></a>
<details>
<summary><a><span class="declaration"><code>__init__</code><span class="parameters">self, vocab, orth_and_spaces=None</span></span></a></summary> This method of constructing a <code>Doc</code> object is usually only used for deserialization. Standard usage is to construct the document via a call to the language object.
<ul>
<li><strong>vocab</strong> &#8211; A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer).
</li>
<li><strong>orth_and_spaces</strong> &#8211; A list of <code>(orth_id, has_space)</code> tuples, where <code>orth_id</code> is an integer, and has_space is a boolean, indicating whether the token has a trailing space.
</li>
</ul>
</details>
</details>
<details>
<summary>
<h4>Sequence API</h4>
</summary>
<li><span class="declaration"><code>doc[i]</code></span> Get the <code>Token</code> object at position <code>i</code>, where <code>i</code> is an integer. Negative indexing is supported, and follows the usual Python semantics, i.e. <code>doc[-2]</code> is <code>doc[len(doc) - 2]</code>.
</li>
<li><span class="declaration"><code>doc[start : end]</code></span> Get a <code>Span</code> object, starting at position <code>start</code> and ending at position <code>end</code>. For instance, <code>doc[2:5]</code> produces a span consisting of tokens 2, 3 and 4. Stepped slices (e.g. <code>doc[start : end : step]</code>) are not supported, as <code>Span</code> objects must be contiguous (cannot have gaps).
</li>
<li><span class="declaration"><code>for token in doc</code></span>Iterate over <code>Token </code> objects, from which the annotations can be easily accessed. This is the main way of accessing <code>Token</code> objects, which are the main way annotations are accessed from Python. If faster-than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython, via <code>Doc.data</code>, an array of <code>TokenC</code> structs. The C API has not yet been finalized, and is subject to change.
</li>
<li><span class="declaration"><code>len(doc)</code></span> The number of tokens in the document.
</li>
</details>
<details>
<summary>
<h4>Sentence, entity and noun chunk spans</h4>
</summary>
<details>
<summary><span class="declaration"><code>sents</code></span></summary>
<p> Yields sentence <code>Span</code> objects. Iterate over the span to get individual <code>Token</code> objects. Sentence spans have no label.
<pre class="language-python"><code>>>> from spacy.en import English
>>> nlp = English()
>>> doc = nlp(u"This is a sentence. Here's another...")
>>> for sentence in doc.sents:
... sentence.root.orth_
is
's</code></pre>
</p>
</details>
<details>
<summary><span class="declaration"><code>ents</code></span></summary>
<p> Yields named-entity <code>Span</code> objects. Iterate over the span to get individual <code>Token</code> objects, or access the label:
<pre><code>>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> ents[0].label, ents[0].label_, ents[0].orth_, ents[0].string
(112504, 'PERSON', 'Best', ents[0].string) </code></pre>
</p>
</details>
<details>
<summary><span class="declaration"><code>noun_chunks</code></span></summary>
<p> Yields base noun-phrase <code>Span </code> objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it &ndash; so no NP-level coordination, no prepositional phrases, and no relative clauses. For example:
<pre class="language-python"><code>>>> from spacy.en import English
>>> nlp = English()
>>> doc = nlp('The sentence in this example has three noun chunks.')
>>> for chunk in doc.noun_chunks:
... print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
NP The sentence <-- has
NP this example <-- in
NP three noun chunks <-- has</code></pre>
</p>
</details>
</details>
<details>
<summary>
<h4>Export/Import</h4>
</summary>
<details>
<summary><a><span class="declaration"><code>to_array</code><span class="parameters">attr_ids</span></span></a></summary>Given a list of M attribute IDs, export the tokens to a numpy ndarray of shape N*M, where N is the length of the sentence.
<ul>
<li><strong>attr_ids</strong> (list[int]) &#8211;A list of attribute ID ints. Attribute IDs can be imported from <code>spacy.attrs</code>
</li>
</ul>
</details>
<details>
<summary><a><span class="declaration"><code>count_by</code><span class="parameters">attr_id</span></span></a></summary>Produce a dict of <code>{attribute (int): count (ints)}</code> frequencies, keyed by the values of the given attribute ID.
<pre class="language-python"><code>>>> from spacy.en import English, attrs
>>> nlp = English()
>>> tokens = nlp(u'apple apple orange banana')
>>> tokens.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> tokens.to_array([attrs.ORTH])
array([[11880],
[11880],
[7561],
[12800]])</code></pre>
</details>
<details>
<summary><a><span class="declaration"><code>from_array</code><span class="parameters">attrs, array</span></span></a></summary>Write to a <code>Doc</code> object, from an M*N array of attributes.
</details>
<details>
<summary><a><span class="declaration"><code>from_bytes</code><span class="parameters"></span></span></a></summary>Deserialize, loading from bytes.
</details>
<details>
<summary><a><span class="declaration"><code>to_bytes</code><span class="parameters"></span></span></a></summary>Serialize, producing a byte string.
</details>
<details>
<summary><a><span class="declaration"><code>read_bytes</code><span class="parameters"></span></span></a></summary>classmethod
</details>
</details>
</details>
<details>
<summary><a name="token"><span class="declaration"><span class="label">class</span><code>Token</code></span></a></summary>A Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. <code>token.orth</code> is an integer ID, <code>token.orth_</code> is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed.
<details>
<summary>
<h4>String Features</h4>
</summary>
<ul>
<li><span class="declaration"><code>lemma / lemma_</code></span>The "base" of the word, with no inflectional suffixes, e.g. the lemma of "developing" is "develop", the lemma of "geese" is "goose", etc. Note that <em>derivational</em> suffixes are not stripped, e.g. the lemma of "institutions" is "institution", not "institute". Lemmatization is performed using the WordNet data, but extended to also cover closed-class words such as pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". We assign pronouns the lemma <code>-PRON-</code>.
</li>
</ul>
<ul>
<li><span class="declaration"><code>orth / orth_</code></span>The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
</li>
<li><span class="declaration"><code>lower / lower_</code></span>The form of the word, but forced to lower-case, i.e. <code class="language-python">lower = word.orth_.lower()</code>
</li>
<li><span class="declaration"><code>shape / shape_</code></span>A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
</li>
<li><span class="declaration"><code>prefix / prefix_</code></span>A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. <code class="language-python">prefix = word.orth_[:1]</code>
</li>
<li><span class="declaration"><code>suffix / suffix_</code></span>A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. <code class="language-python">suffix = word.orth_[-3:]</code>
</li>
</ul>
</details>
<details>
<summary>
<h4>Boolean Flags</h4>
</summary>
<ul>
<li><span class="declaration"><code>is_alpha</code></span> Equivalent to <code class="language-python">word.orth_.isalpha()</code>
</li>
<li><span class="declaration"><code>is_ascii</code></span> Equivalent to <code class="language-python">all(ord(c) &lt; 128 for c in word.orth_)</code>
</li>
<li><span class="declaration"><code>is_digit</code></span> Equivalent to <code class="language-python">word.orth_.isdigit()</code>
</li>
<li><span class="declaration"><code>is_lower</code></span> Equivalent to <code class="language-python">word.orth_.islower()</code>
</li>
<li><span class="declaration"><code>is_title</code></span> Equivalent to <code class="language-python">word.orth_.istitle()</code>
</li>
<li><span class="declaration"><code>is_punct</code></span> Equivalent to <code class="language-python">word.orth_.ispunct()</code>
</li>
<li><span class="declaration"><code>is_space</code></span> Equivalent to <code class="language-python">word.orth_.isspace()</code>
</li>
<li><span class="declaration"><code>like_url</code></span> Does the word resemble a URL?
</li>
<li><span class="declaration"><code>like_num</code></span> Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
</li>
<li><span class="declaration"><code>like_email</code></span> Does the word resemble an email?
</li>
<li><span class="declaration"><code>is_oov</code></span> Is the word out-of-vocabulary?
</li>
</ul>
<details>
<summary><a><span class="declaration"><code>check_flag</code><span class="parameters">flag_id</span></span></a></summary>Get the value of one of the boolean flags
</details>
</details>
<details>
<summary>
<h4>Distributional Features</h4>
</summary>
<ul>
<li><span class="declaration"><code>prob</code></span> The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
</li>
<li><span class="declaration"><code>cluster</code></span> The Brown cluster ID of the word. These are often useful features for linear models. If you're using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
</li>
<li><span class="declaration"><code>repvec</code></span> A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
</li>
</ul>
</details>
<details>
<summary>
<h4>Alignment and Output</h4>
</summary>
<ul>
<li><span class="declaration"><code>idx</code></span>Start index of the token in the string
</li>
<li><span class="declaration"><code>len(token)</code></span>Length of the token's orth string, in unicode code-points.
</li>
<li><span class="declaration"><code>unicode(token)</code></span>Same as token.orth_
</li>
<li><span class="declaration"><code>str(token)</code></span>In Python 3, returns <code>token.orth_</code>. In Python 2, returns<code>token.orth_.encode('utf8')</code>
</li>
<li><span class="declaration"><code>string</code></span><code>token.orth_ + token.whitespace_</code>, i.e. the form of the word as it appears in the string,
including trailing whitespace. This is useful when you need to use linguistic features to add inline mark-up to the string.
</li>
<li><span class="declaration"><code>whitespace_</code></span>The trailing whitespace attached to the word in the string — an empty string if the token has no trailing space.
</li>
</ul>
<define>
<summary>
<h4>Navigating the Parse Tree</h4>
</summary>
<li><span class="declaration"><code>head</code></span>The immediate syntactic head of the token. If the token is the root of its sentence, it is the token itself, i.e. <code>root_token.head is root_token</code>
</li>
<li><span class="declaration"><code>children</code></span>An iterator that yields from lefts, and then yields from rights.
</li>
<li><span class="declaration"><code>subtree</code></span>An iterator for the part of the sentence syntactically governed by the word, including the word itself.
</li>
<li><span class="declaration"><code>left_edge</code></span>The leftmost edge of the token's subtree
</li>
<li><span class="declaration"><code>right_edge</code></span>The rightmost edge of the token's subtree
</li>
</define>
<details>
<summary><a><span class="declaration"><code>nbor(i=1)</code><span class="parameters"></span></span></a></summary>Get the <em>i</em>th next / previous neighboring token.
</details>
</details>
<details>
<summary>
<h4>Named Entities</h4>
</summary>
<ul>
<li><span class="declaration"><code>ent_type</code></span>If the token is part of an entity, its entity type.
</li>
<li><span class="declaration"><code>ent_iob</code></span>The IOB (inside, outside, begin) entity recognition tag for the token.
</li>
</ul>
</details>
<details>
<summary>
<h4>Constructors</h4>
</summary>
<details>
<summary><a><span class="declaration"><code>__init__</code><span class="parameters">vocab, doc, offset</span></span></a></summary>
<ul>
<li><strong>vocab</strong> &#8211;A Vocab object
</li>
<li><strong>doc</strong> &#8211;The parent sequence
</li>
<li><strong>offset</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#int"><em>int</em></a>) &#8211;The index of the token within the document
</li>
</ul>
</details>
<!--+attribute("conjuncts")-->
<!-- | Conjuncts-->
</details>
</details>
<details>
<summary><a name="span"><span class="declaration"><span class="label">class</span><code>Span</code></span></a></summary>A <code>Span</code> is a slice of a <code>Doc</code> object, consisting of zero or more tokens. Spans are used to represent sentences, named entities, phrases, and arbitrary contiguous slices from the <code>Doc</code> object. <code>Span</code> objects are views &ndash; that is, they do not copy the underlying C data. This makes them cheap to construct, as internally are simply a reference to the <code>Doc</code> object, a start position, an end position, and a label ID.
<li><span class="declaration"><code>token = span[i]</code></span>Get the <code>Token</code> object at position <em>i</em>, where <em>i</em> is an offset within the <code>Span</code>, not the document. That is:
<pre class="language-python"><code>span = doc[4:6]
token = span[0]
assert token.i == 4</code></pre>
</li>
<ul>
<li><span class="declaration"><code>for token in span</code></span>Iterate over the <code>Token</code> objects in the span.
</li>
<li><span class="declaration"><code>__len__</code></span>Number of tokens in the span.
</li>
<li><span class="declaration"><code>start</code></span>The start offset of the span, i.e. <code class="language-python">span[0].i</code>.
</li>
<li><span class="declaration"><code>end</code></span>The end offset of the span, i.e. <code class="language-python">span[-1].i + 1</code>
</li>
</ul>
<details>
<summary>
<h4>Navigating the Parse Tree</h4>
</summary>
<ul>
<li><span class="declaration"><code>root</code></span>The first ancestor of the first word of the span that has its head outside the span. For example:
<pre class="language-python"><code>>>> toks = nlp(u'I like New York in Autumn.')</code></pre>
<p>Let's name the indices --- easier than writing <code>toks[4]</code> etc.</p>
<pre class="language-python"><code>>>> i, like, new, york, in_, autumn, dot = range(len(toks)) </code></pre>
<p>The head of <em>new</em> is <em>York</em>, and the head of <em>York</em> is <em>like</em></p>
<pre class="language-python"><code>>>> toks[new].head.orth_
'York'
>>> toks[york].head.orth_
'like'</code></pre>
<p>Create a span for "New York". Its root is "York".</p>
<pre class="language-python"><code>>>> new_york = toks[new:york+1]
>>> new_york.root.orth_
'York'</code></pre>
<p>When there are multiple words with external dependencies, we take the first:</p>
<pre class="language-python"><code>>>> toks[autumn].head.orth_, toks[dot].head.orth_
('in', 'like')
>>> autumn_dot = toks[autumn:]
>>> autumn_dot.root.orth_
'Autumn'</code></pre>
</li>
<li><span class="declaration"><code>lefts</code></span>Tokens that are to the left of the span, whose head is within the span, i.e. <code class="language-python">
lefts = [span.doc[i] for i in range(0, span.start)
if span.doc[i].head in span]</code>
</li>
<li><span class="declaration"><code>rights</code></span>Tokens that are to the right of the span, whose head is within the span, i.e.
<pre class="language-python"><code>rights = [span.doc[i] for i in range(span.end, len(span.doc))
if span.doc[i].head in span]</code></pre>
</li>
</ul>
<li><span class="declaration"><code>subtree</code></span>Tokens in the range <code>(start, end+1)</code>, where <code>start</code> is the index of the leftmost word descended from a token in the span, and <code>end</code> is the index of the rightmost token descended from a token in the span.
</li>
</details>
<details>
<summary>
<h4>Constructors</h4>
</summary>
<ul>
<li><span class="declaration"><code>doc[start : end]</code></span>
</li>
<li><span class="declaration"><code>for entity in doc.ents</code></span>
</li>
<li><span class="declaration"><code>for sentence in doc.sents</code></span>
</li>
<li><span class="declaration"><code>for noun_phrase in doc.noun_chunks</code></span>
</li>
<li><span class="declaration"><code>span = Span(doc, start, end, label=0)</code></span>
</li>
</ul>
<details>
<summary><a><span class="declaration"><code>__init__</code><span class="parameters"></span></span></a></summary>Temp <code>span = doc[0:4]</code>
</details>
</details>
<details>
<summary>
<h4>String Views</h4>
</summary>
<details open="open">
<summary><span class="declaration"><code>string</code></span></summary>
<p>String
</p>
</details>
<details open="open">
<summary><span class="declaration"><code>lemma / lemma_</code></span></summary>
<p>String
</p>
</details>
<details open="open">
<summary><span class="declaration"><code>label / label_</code></span></summary>
<p>String
</p>
</details>
</details>
</details>
<details>
<summary><a name="lexeme"><span class="declaration"><span class="label">class</span><code>Lexeme</code></span></a></summary>
<p>The Lexeme object represents a lexical type, stored in the vocabulary &ndash; as opposed to a token, occurring in a document.</p>
<p>Lexemes store various features, so that these features can be computed once per type, rather than once per token. As job sizes grow, this can amount to a substantial efficiency improvement.</p>
<p>All Lexeme attributes are therefore context independent, as a single lexeme is reused for all usages of that word. Lexemes are keyed by the “orth” attribute. </p>
<p>All Lexeme attributes are accessible directly on the Token object.</p>
<details>
<summary>
<h4>String Features</h4>
</summary>
<ul>
<li><span class="declaration"><code>orth / orth_</code></span>The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
</li>
<li><span class="declaration"><code>lower / lower_</code></span>The form of the word, but forced to lower-case, i.e. <code class="language-python">lower = word.orth_.lower()</code>
</li>
<li><span class="declaration"><code>shape / shape_</code></span>A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
</li>
<li><span class="declaration"><code>prefix / prefix_</code></span>A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. <code class="language-python">prefix = word.orth_[:1]</code>
</li>
<li><span class="declaration"><code>suffix / suffix_</code></span>A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. <code class="language-python">suffix = word.orth_[-3:]</code>
</li>
</ul>
</details>
<details>
<summary>
<h4>Boolean Features</h4>
</summary>
<ul>
<li><span class="declaration"><code>is_alpha</code></span> Equivalent to <code class="language-python">word.orth_.isalpha()</code>
</li>
<li><span class="declaration"><code>is_ascii</code></span> Equivalent to <code class="language-python">all(ord(c) &lt; 128 for c in word.orth_)</code>
</li>
<li><span class="declaration"><code>is_digit</code></span> Equivalent to <code class="language-python">word.orth_.isdigit()</code>
</li>
<li><span class="declaration"><code>is_lower</code></span> Equivalent to <code class="language-python">word.orth_.islower()</code>
</li>
<li><span class="declaration"><code>is_title</code></span> Equivalent to <code class="language-python">word.orth_.istitle()</code>
</li>
<li><span class="declaration"><code>is_punct</code></span> Equivalent to <code class="language-python">word.orth_.ispunct()</code>
</li>
<li><span class="declaration"><code>is_space</code></span> Equivalent to <code class="language-python">word.orth_.isspace()</code>
</li>
<li><span class="declaration"><code>like_url</code></span> Does the word resemble a URL?
</li>
<li><span class="declaration"><code>like_num</code></span> Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
</li>
<li><span class="declaration"><code>like_email</code></span> Does the word resemble an email?
</li>
<li><span class="declaration"><code>is_oov</code></span> Is the word out-of-vocabulary?
</li>
</ul>
</details>
<details>
<summary>
<h4>Distributional Features</h4>
</summary>
<ul>
<li><span class="declaration"><code>prob</code></span> The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
</li>
<li><span class="declaration"><code>cluster</code></span> The Brown cluster ID of the word. These are often useful features for linear models. If you're using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
</li>
<li><span class="declaration"><code>repvec</code></span> A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
</li>
</ul>
</details>
<details>
<summary>
<h4>Constructors</h4>
</summary>
<details open="open">
<summary><a><span class="declaration"><code>__init__</code><span class="parameters"></span></span></a></summary>
<p>Init</p>
</details>
</details>
</details>
<details>
<summary><a><span class="declaration"><span class="label">class</span><code>Vocab</code></span></a></summary>
<ul>
<li><span class="declaration"><code>lexeme = vocab[integer_id]</code></span>Get a lexeme by its orth ID
</li>
<li><span class="declaration"><code>lexeme = vocab[string]</code></span>Get a lexeme by the string corresponding to its orth ID.
</li>
<li><span class="declaration"><code>for lexeme in vocab</code></span>Iterate over <code>Lexeme</code> objects
</li>
<li><span class="declaration"><code>vocab[integer_id] = attributes_dict</code></span>A props dictionary
</li>
<li><span class="declaration"><code>len(vocab)</code></span>Number of lexemes (unique words) in the vocabulary
</li>
</ul>
<details>
<summary>
<h4>Constructors</h4>
</summary>
<details open="open">
<summary><a><span class="declaration"><code>__init__</code><span class="parameters"></span></span></a></summary>Tmp
</details>
</details>
<details>
<summary>
<h4>Save and Load</h4>
</summary>
<details open="open">
<summary><a><span class="declaration"><code>dump</code><span class="parameters">loc</span></span></a></summary>
<ul>
<li><strong>loc</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) &#8211;Path where the vocabulary should be saved
</li>
</ul>
</details>
<details open="open">
<summary><a><span class="declaration"><code>load_lexemes</code><span class="parameters">loc</span></span></a></summary>
<ul>
<li><strong>loc</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) &#8211;Path to load the lexemes.bin file from
</li>
</ul>
</details>
<details open="open">
<summary><a><span class="declaration"><code>load_vectors</code><span class="parameters">loc</span></span></a></summary>
<ul>
<li><strong>loc</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) &#8211;Path to load the vectors.bin from
</li>
</ul>
</details>
</details>
</details>
<details>
<summary><a><span class="declaration"><span class="label">class</span><code>StringStore</code></span></a></summary>
<p>Intern strings, and map them to sequential integer IDs. The mapping table is very efficient, and a small-string optimization is used to maintain a small memory footprint. Only the integer IDs are held by spaCy's data classes (<code>Doc</code>, <code>Token</code>, <code>Span</code> and <code>Lexeme</code>) &ndash; when you use a string-valued attribute like <code>token.orth_</code>, you access a property that computes <code>token.strings[token.orth]</code>.</p>
<ul>
<li><span class="declaration"><code>string = string_store[int_id]</code></span>Retrieve a string from a given integer ID. If the integer ID is not found, raise <code>IndexError</code>
</li>
<li><span class="declaration"><code>int_id = string_store[unicode_string]</code></span> Map a unicode string to an integer ID. If the string is previously unseen, it is interned, and a new ID is returned.
</li>
<li><span class="declaration"><code>int_id = string_store[utf8_byte_string]</code></span> Byte strings are assumed to be in UTF-8 encoding. Strings encoded with other codecs may fail silently. Given a utf8 string, the behaviour is the same as for unicode strings. Internally, strings are stored in UTF-8 format. So if you start with a UTF-8 byte string, it's less efficient to first decode it as unicode, as StringStore will then have to encode it as UTF-8 once again.
</li>
<li><span class="declaration"><code>n_strings = len(string_store)</code></span>Number of strings in the string-store
</li>
<li><span class="declaration"><code>for string in string_store</code></span>Iterate over strings in the string store, in order, such that the <em>i</em>th string in the sequence has the ID <em>i</em>:
<pre class="language-python"><code>for i, string in enumerate(string_store):
assert i == string_store[string]</code></pre>
</li>
</ul>
<details>
<summary>
<h4>Constructors</h4>
</summary>
<p><code>StringStore.__init__</code> takes no arguments, so a new instance can be constructed as follows:</p>
<pre class="language-python"><code>string_store = StringStore()</code></pre>
<p>However, in practice you'll usually use the instance owned by the language's <code>vocab</code> object, which all classes hold a reference to:</p>
<ul>
<li><code class="language-python">english.vocab.strings</code></li>
<li><code class="language-python">doc.vocab.strings</code></li>
<li><code class="language-python">span.vocab.strings</code></li>
<li><code class="language-python">token.vocab.strings</code></li>
<li><code class="language-python">lexeme.vocab.strings</code></li>
</ul>
<p>If you create another instance, it will map strings to different integers &ndash; which is usually not what you want.</p>
</details>
<details>
<summary>
<h4>Save and Load</h4>
</summary>
<details open="open">
<summary><a><span class="declaration"><code>dump</code><span class="parameters">loc</span></span></a></summary>
<p>Save the strings mapping to the given location, in plain text. The format is subject to change, so if you need to read/write compatible files, you can find details in the <code>strings.pyx</code> source.</p>
</details>
<details open="open">
<summary><a><span class="declaration"><code>load</code><span class="parameters">loc</span></span></a></summary>
<p>Load the strings mapping from a plain-text file in the given location. The format is subject to change, so if you need to read/write compatible files, you can find details in the <code>strings.pyx</code> source.</p>
</details>
</details>
</details>

View File

@ -87,8 +87,7 @@ mixin SeeAlso(name, link_target)
mixin Define(term)
li
#[span.declaration #[code #{term}]]
li #[span.declaration #[code #{term}]]
block

View File

@ -11,8 +11,7 @@ mixin row(...cells)
mixin Define(term)
li
#[code #{term}]
li #[code #{term}]
block

View File

@ -99,7 +99,7 @@ mixin WritePage(Site, Author, Page)
meta(property="article:published_time" content=getDate(Page.date).timestamp)
link(rel="stylesheet" href="/resources/css/style.css")
//[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]
//[if lt IE 9]><script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]
body(id=Page.type)
header(role="banner")
@ -114,8 +114,7 @@ mixin WritePage(Site, Author, Page)
nav(role="navigation")
li(class={active: Page.active.home}): a(href="/") Home
li(class={active: Page.active.docs}): a(href="/docs") Docs
li: a(href="/displacy", target="_blank") Demo
//li(class={active: Page.active.license}): a(href="/license") License
li: a(href="http://api.spacy.io/displacy", target="_blank") Demo
li(class={active: Page.active.blog}): a(href="/blog") Blog
main#content
block

View File

@ -7,10 +7,9 @@ mixin Option(name, open)
+Option("Updating your installation")
| To update your installation:
pre.language-bash
code
$ pip install --upgrade spacy
$ python -m spacy.en.download --force all
pre.language-bash: code
| $ pip install --upgrade spacy
| $ python -m spacy.en.download --force all
p Most updates ship a new model, so you will usually have to redownload the data.

View File

@ -1,5 +1,5 @@
mixin Displacy(sentence, caption_text, height)
- var url = "/displacy/?full=" + sentence.replace(" ", "%20")
- var url = "http://api.spacy.io/displacy/?full=" + sentence.replace(" ", "%20")
.displacy
iframe.displacy(src="/resources/displacy/displacy_demo.html" height=height)
@ -17,6 +17,6 @@ mixin Displacy(sentence, caption_text, height)
275
)
p #[a(href="/displacy") displaCy] lets you peek inside spaCy's syntactic parser, as it reads a sentence word-by-word. By repeatedly choosing from a small set of actions, it links the words together according to their syntactic structure. This type of representation powers a wide range of technologies, from translation and summarization, to sentiment analysis and algorithmic trading. #[a(href="/blog/displacy") Read more.]
p #[a(href="http://api.spacy.io/displacy") displaCy] lets you peek inside spaCy's syntactic parser, as it reads a sentence word-by-word. By repeatedly choosing from a small set of actions, it links the words together according to their syntactic structure. This type of representation powers a wide range of technologies, from translation and summarization, to sentiment analysis and algorithmic trading. #[a(href="/blog/displacy") Read more.]

View File

@ -29,10 +29,10 @@ include ../header.jade
li: a.button(href="#example-use") Examples
li: a.button(href="#install")
| Install
<span class="button-caption">v0.97</span>
<span class="button-caption">v0.99</span>
article.page.landing-page
+Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
+Section("Online Demo", "online-demo", "./_online_demo.jade")
+Section("Usage by Example", "example-use", "./_usage_examples.jade")
+Section("Install v0.97", "install", "./_installation.jade")
+Section("Install v0.99", "install", "./_installation.jade")

View File

@ -117,8 +117,8 @@ include ./meta.jade
details: summary: h4 Create frequencies
pre.language-bash: code
$ python bin/get_freqs.py
$ python bin/gather_freqs.py
| $ python bin/get_freqs.py
| $ python bin/gather_freqs.py
details: summary: h4 Brown clusters