mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
* Add website to version control
This commit is contained in:
parent
7820c504d7
commit
15f18e5753
67
website/src/jade/404.html
Normal file
67
website/src/jade/404.html
Normal file
|
@ -0,0 +1,67 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>404 | spaCy.io</title>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
|
||||
<meta name="description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
|
||||
<meta itemprop="name" content="404 | spaCy.io">
|
||||
<meta itemprop="description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
|
||||
<meta itemprop="image" content="http://spacy.io/resources/img/social.png">
|
||||
<meta name="twitter:card" content="summary">
|
||||
<meta name="twitter:site" content="spacy_io">
|
||||
<meta name="twitter:title" content="404 | spaCy.io">
|
||||
<meta name="twitter:description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
|
||||
<meta name="twitter:creator" content="@spacy_io">
|
||||
<meta name="twitter:image" content="http://spacy.io/resources/img/social_small.png">
|
||||
<meta property="og:title" content="404 | spaCy.io">
|
||||
<meta property="og:type" content="article">
|
||||
<meta property="og:url" content="http://spacy.io/">
|
||||
<meta property="og:image" content="http://spacy.io/resources/img/social.png">
|
||||
<meta property="og:description" content="spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle.">
|
||||
<meta property="og:site_name" content="spaCy.io">
|
||||
<meta property="article:published_time">
|
||||
<link rel="stylesheet" href="/resources/css/style.css">
|
||||
<!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
|
||||
</head>
|
||||
<body id="">
|
||||
<header role="banner">
|
||||
<h1 class="logo">spaCy.io</h1>
|
||||
<div class="slogan">
|
||||
</div>
|
||||
</header>
|
||||
<nav role="navigation">
|
||||
<li><a href="/">Home</a></li>
|
||||
<li><a href="/docs">Docs</a></li>
|
||||
<li><a href="/license">License</a></li>
|
||||
<li><a href="/blog">Blog</a></li>
|
||||
</nav>
|
||||
<main id="content">
|
||||
</main>
|
||||
<script src="/resources/js/prism.min.js"></script>
|
||||
<!-- Details polyfill-->
|
||||
<script>
|
||||
var details = document.getElementsByTagName("details");
|
||||
var summary = document.getElementsByTagName("summary");
|
||||
for(var i = 0; i < details.length; i++) {
|
||||
(details[i].getAttribute("open") == null) ? details[i].setAttribute("data-open", "false") : details[i].setAttribute("data-open", "true");
|
||||
}
|
||||
for(var i = 0; i < summary.length; i++) {
|
||||
summary[i].addEventListener( "click", function(e) {
|
||||
var parent = this.parentElement;
|
||||
(parent.getAttribute("data-open") == "false") ? parent.setAttribute("data-open", "true") : parent.setAttribute("data-open", "false");
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<!-- Google analytics-->
|
||||
<script>
|
||||
// Google Analytics loader: stores the name 'ga' under window.GoogleAnalyticsObject,
|
||||
// defines window.ga as a stub that pushes its arguments onto ga.q (unless ga already
|
||||
// exists), records a timestamp in ga.l, then creates an async <script> pointing at
|
||||
// analytics.js and inserts it before the first <script> element in the document.
|
||||
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
|
||||
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
|
||||
// Issue the 'create' command for property UA-58931649-1, then send a pageview.
|
||||
ga('create', 'UA-58931649-1', 'auto');
|
||||
ga('send', 'pageview');
|
||||
</script>
|
||||
<footer role="contentinfo"><span class="slogan copyright">© 2015 Syllogism Co. | <a href="mailto:contact@spacy.io">Contact</a></span></footer>
|
||||
</body>
|
||||
</html>
|
13
website/src/jade/404.jade
Normal file
13
website/src/jade/404.jade
Normal file
|
@ -0,0 +1,13 @@
|
|||
include ./header
|
||||
include ./mixins.jade
|
||||
|
||||
- var Page = InitPage(Site, Authors.spacy, "home", '404')
|
||||
- Page.is_error = true
|
||||
- Site.slogan = "404"
|
||||
- Page.active = {}
|
||||
|
||||
+WritePage(Site, Authors['spacy'], Page)
|
||||
h2 Page not found
|
||||
|
||||
section.intro
|
||||
p The page you tried to access was not found. #[a(href="javascript:history.back()") Click here] to go back.
|
116
website/src/jade/blog/_agpl_license.jade
Normal file
116
website/src/jade/blog/_agpl_license.jade
Normal file
|
@ -0,0 +1,116 @@
|
|||
|
||||
//p.
|
||||
// To make spaCy as valuable as possible, licenses to it are for life. You get
|
||||
// complete transparency, certainty and control. If you need to use spaCy
|
||||
// as an API, it's trivial to host it yourself – and you don't need to
|
||||
// worry about the service changing or disappearing. And if you're ever in
|
||||
// acquisition or IPO talks, the story is simple.
|
||||
|
||||
//p.
|
||||
// spaCy can also be used as free open-source software, under the Affero GPL
|
||||
// license. If you use it this way, you must comply with the AGPL license
|
||||
// terms. When you distribute your project, or offer it as a network service,
|
||||
// you must distribute the source-code and grant users an AGPL license to it.
|
||||
|
||||
|
||||
//h3 Examples
|
||||
|
||||
//p.
|
||||
// In order to clarify how spaCy's license structure might apply to you, I've
|
||||
// written a few examples, in the form of user-stories.
|
||||
|
||||
//details
|
||||
// summary: h4 Seed stage start-ups
|
||||
|
||||
// p.
|
||||
// Ashley and Casey have an idea for a start-up. To explore their idea, they
|
||||
// want to build a minimum viable product they can put in front of potential
|
||||
// users and investors.
|
||||
|
||||
// p. They have two options.
|
||||
|
||||
// ol
|
||||
// li
|
||||
// p.
|
||||
// <strong>Trial commercial license.</strong> With a simple form, they can
|
||||
// use spaCy for 90 days, for a nominal fee of $1. They are free to modify
|
||||
// spaCy, and they will own the copyright to their modifications for the
|
||||
// duration of the license. After the trial period elapses, they can either
|
||||
// pay the license fee, stop using spaCy, or release their project under the
|
||||
// AGPL.
|
||||
//
|
||||
// li
|
||||
// p.
|
||||
// <strong>AGPL.</strong> Ashley and Casey can instead use spaCy under the AGPL
|
||||
// license. However, they must then release any code that statically or
|
||||
// dynamically links to spaCy under the AGPL as well (e.g. if they import
|
||||
// the module, or import a module that imports it, etc). They also cannot
|
||||
// use spaCy as a network resource, by running it as a service --- this is
|
||||
// the loophole that the "A" part of the AGPL is designed to close.
|
||||
//
|
||||
// p.
|
||||
// Ashley and Casey find the AGPL license unattractive for commercial use.
|
||||
// They decide to take up the trial commercial license. However, over the
|
||||
// next 90 days, Ashley has to move house twice, and Casey gets sick. By
|
||||
// the time the trial expires, they still don't have a demo they can show
|
||||
// investors. They send an email explaining the situation, and a 90 day extension
|
||||
// to their trial license is granted.
|
||||
|
||||
// p.
|
||||
// By the time the extension period has elapsed, spaCy has helped them secure
|
||||
// funding, and they even have a little revenue. They are glad to pay the
|
||||
// $5,000 commercial license fee.
|
||||
|
||||
// p.
|
||||
// spaCy is now permanently licensed for the product Ashley and Casey are
|
||||
// developing. They own the copyright to any modifications they make to spaCy,
|
||||
// but not to the original spaCy code.
|
||||
|
||||
// p.
|
||||
// No additional fees will be due when they hire new developers, run spaCy on
|
||||
// additional internal servers, etc. If their company is acquired, the license
|
||||
// will be transferred to the company acquiring them. However, to use spaCy
|
||||
// in another product, they will have to buy a second license.
|
||||
|
||||
|
||||
// details
|
||||
// summary: h4 University academics
|
||||
|
||||
// p.
|
||||
// Alex and Sasha are post-doctoral researchers working for a university.
|
||||
// Part of their funding comes from a grant from Google, but Google will not
|
||||
// own any part of the work that they produce. Their mission is just to write
|
||||
// papers.
|
||||
|
||||
// p.
|
||||
// Alex and Sasha find spaCy convenient, so they use it in their system under
|
||||
// the AGPL. This means that their system must also be released under the
|
||||
// AGPL, but they're cool with that – they were going to release their
|
||||
// code anyway, as it's the only way to ensure their experiments are properly
|
||||
// repeatable.
|
||||
|
||||
// p.
|
||||
// Alex and Sasha find and fix a few bugs in spaCy. They must release these
|
||||
// modifications, and they ask that they be accepted into the main spaCy repo.
|
||||
// In order to do this, they must sign a contributor agreement, ceding their
|
||||
// copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
|
||||
// not be able to claim any royalties from their contributions.
|
||||
|
||||
// p.
|
||||
// Later, Alex and Sasha implement new features into spaCy, for another paper.
|
||||
// The code was quite rushed, and they don't want to take the time to put
|
||||
// together a proper pull request. They must release their modifications
|
||||
// under the AGPL, but they are not obliged to contribute it to the spaCy
|
||||
// repository, or concede their copyright.
|
||||
|
||||
// details
|
||||
// summary: h4 Open Source developers
|
||||
|
||||
// p.
|
||||
// Phuong and Jessie use the open-source software Calibre to manage their
|
||||
// e-book libraries. They have an idea for a search feature, and they want
|
||||
// to use spaCy to implement it. Calibre is released under the GPLv3. The
|
||||
// AGPL has additional restrictions for projects used as a network resource,
|
||||
// but they don't apply to this project, so Phuong and Jessie can use spaCy
|
||||
// to improve Calibre. They'll have to release their code, but that was
|
||||
// always their intention anyway.
|
97
website/src/jade/blog/dead-code-should-be-buried/index.html
Normal file
97
website/src/jade/blog/dead-code-should-be-buried/index.html
Normal file
|
@ -0,0 +1,97 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>Natural Language Processing Software Badly Needs Some Deprecation Notices | spaCy.io</title>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
|
||||
<meta name="description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
|
||||
<meta itemprop="name" content="Natural Language Processing Software Badly Needs Some Deprecation Notices">
|
||||
<meta itemprop="description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
|
||||
<meta itemprop="image" content="http://spacy.io/resources/img/social.png">
|
||||
<meta name="twitter:card" content="summary">
|
||||
<meta name="twitter:site" content="spacy_io">
|
||||
<meta name="twitter:title" content="Natural Language Processing Software Badly Needs Some Deprecation Notices">
|
||||
<meta name="twitter:description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
|
||||
<meta name="twitter:creator" content="@honnibal">
|
||||
<meta name="twitter:image" content="http://spacy.io/resources/img/social_small.png">
|
||||
<meta property="og:title" content="Natural Language Processing Software Badly Needs Some Deprecation Notices">
|
||||
<meta property="og:type" content="article">
|
||||
<meta property="og:url" content="http://spacy.io/blog/introducing-spacy">
|
||||
<meta property="og:image" content="http://spacy.io/resources/img/social.png">
|
||||
<meta property="og:description" content="Imagine: you go to use Google, but before you can search, you first have to select which model you want. Of course, this isn't how Google operates. They just give you the best model. This is what spaCy does, too, because we actually care whether the model you use is good. Most NLP libraries apparently don't.">
|
||||
<meta property="og:site_name" content="spaCy.io">
|
||||
<meta property="article:published_time" content="2015-02-19T00:00:00.000Z">
|
||||
<link rel="stylesheet" href="/resources/css/style.css">
|
||||
<!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
|
||||
</head>
|
||||
<body id="blog">
|
||||
<header role="banner">
|
||||
<h1 class="logo">spaCy.io</h1>
|
||||
<div class="slogan">Blog
|
||||
</div>
|
||||
</header>
|
||||
<nav role="navigation">
|
||||
<li><a href="/">Home</a></li>
|
||||
<li><a href="/docs">Docs</a></li>
|
||||
<li><a href="/displacy" target="_blank">Demo</a></li>
|
||||
<li><a href="/license">License</a></li>
|
||||
<li class="active"><a href="/blog">Blog</a></li>
|
||||
</nav>
|
||||
<main id="content">
|
||||
<article class="post">
|
||||
<header>
|
||||
<h2>
|
||||
<strike></strike>
|
||||
</h2>
|
||||
<h2>Natural Language Processing Software Badly Needs Some Deprecation Notices</h2>
|
||||
<div class="subhead">by <a href="//twitter.com/honnibal" rel="author" target="_blank">Matthew Honnibal</a> on
|
||||
<time>February 19, 2015</time>
|
||||
</div>
|
||||
</header>
|
||||
<p>Imagine: you try to use Google Translate, but it asks you to first select which model you want. The new, awesome deep-learning model is there, but so are lots of others. You pick one that sounds fancy, but it turns out it's a 20-year old experimental model trained on a corpus of oven manuals. You are not interested in oven manuals.</p>
|
||||
<p>Of course, this is not how Google Translate operates. They make sure the model you use is good. This is what spaCy does, too. But in most natural language understanding libraries, it's just not anybody's job to delete obsolete models. There's also a real reluctance to editorialize. Some advantage can be found for every model. Like, is it really fair to call that oven-specific model obsolete? In some ways we still have a lot to learn from its principled approach. And what if someone needs to translate an oven manual?</p>
|
||||
<p>Have a look through the <a href="http://gate.ac.uk/sale/tao/split.html">GATE software</a>. There's a lot there, developed over 12 years and many person-hours. But there's approximately zero curation. The philosophy is just to provide things. It's up to you to decide what to use.</p>
|
||||
<p>This is bad. It's bad to provide an implementation of <a href="https://gate.ac.uk/sale/tao/splitch18.html">MiniPar</a>, and have it just...sit there, with no hint that it's 20 years old and should not be used. The RASP parser, too. Why are these provided? Worse, why is there no warning? Unless you want to investigate the history of the field, there's no reason to execute these programs in 2015.</p>
|
||||
<p><a href="http://webdocs.cs.ualberta.ca/~lindek/minipar.htm">Check out</a> how <a href="http://research.google.com/pubs/author108.html">Dekang Lin</a>, the author of Minipar, presents the software – with reference to a benchmark on a Pentium II. This is the right way to archive the program. In this form its status is clear.</p>
|
||||
<p>Various people have asked me why I decided to make a new Python NLP library, <a href="http://spacy.io">spaCy</a>, instead of supporting the <a href="http://nltk.org">NLTK</a> project. There are many things I dislike about the NLTK code-base, but the lack of curation is really my key complaint: the project simply doesn't throw anything away, and it refuses to call any technique or implementation good or bad. </p>
|
||||
<p>In March NLTK announced the inclusion of a more up-to-date dependency parsing algorithm, based on the linear-time algorithm everyone is now using. There was some excitement about this, as this type of parser really should get much better accuracy than the other algorithms NLTK includes. But can you tell <a href="http://www.nltk.org/py-modindex.html">which of these parsers is the new one?</a></p>
|
||||
<p>The best parser there – the new one – is called "transition parser". But it's still not actually good. Unfortunately, the NLTK implementation is based on Nivre's original 2003 paper, instead of using the recent research; and they use external, general-purpose machine learning libraries, instead of a simple custom implementation that would perform much better. Together these limitations mean the performance of the model is terrible, relative to the current state-of-the-art.</p>
|
||||
<p>I happened to visit the NLTK issue tracker while they were discussing the transition-based parser, so I linked them to my post explaining how to implement this parser in 500 lines of Python. I got a "thanks but no thanks", and <a href="https://github.com/nltk/nltk/issues/694">the issue was abruptly closed</a>. Another researcher's offer from 2012 to implement this type of model also went <a href="http://arxiv.org/pdf/1409.7386v1.pdf">unanswered</a>.</p>
|
||||
<p>An enormous amount of work has gone into, and is still going into, making NLTK an easily accessible way for computer science students to learn a little bit about linguistics, or for linguistics students to learn a little bit about computer science. I respect that work.</p>
|
||||
<p>But nowhere does it say that if you want to really build something, or do up-to-date research, NLTK isn't for you. NLTK claims it can serve that use-case. But it can't. The implication is that if you use the models provided in NLTK, e.g. its chunker, tagger, dependency parser etc, these will be roughly equivalent to what you'll get elsewhere. But they're not. The gulf in quality is enormous. <a href="https://github.com/nltk/nltk/issues/1063">NLTK does not even know how its POS tagger was trained</a>. The model is just this .pickle file that's been passed around for 5 years, its origins lost to time. This is not okay. </p>
|
||||
<p>I think open source software should be very careful to make its limitations clear. It's a disservice to provide something that's much less useful than you imply. It's like offering your friend a lift and then not showing up. It's totally fine to not do something – so long as you never suggested you were going to do it. There are ways to do worse than nothing. </p>
|
||||
<footer role="contentinfo" class="meta"><a href="http://twitter.com/share?text=Natural Language Processing Software Badly Needs Some Deprecation Notices&url=http://spacy.io/blog/introducing-spacy&via=spacy_io" title="Share on Twitter" target="_blank" class="button button-twitter">Share on Twitter </a>
|
||||
<div class="discuss"> <a target="_blank" href="https://www.reddit.com/r/programming/comments/2tlyrr/spacy_industrialstrength_nlp_with_pythoncython" title="Discuss on Reddit" class="button button-reddit">Reddit Thread</a> <a target="_blank" href="https://news.ycombinator.com/item?id=8942783" title="Discuss on Hacker News Thread" class="button button-hn">Hacker News</a>
|
||||
</div>
|
||||
<section class="intro profile">
|
||||
<p><img src="/resources/img/matt.png"> Matthew Honnibal is the author of the <a href="http://spacy.io">spaCy</a> software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin. <span class="social"><a href="//twitter.com/honnibal" target="_blank">Twitter</a></span></p>
|
||||
</section>
|
||||
</footer>
|
||||
</article>
|
||||
</main>
|
||||
<script src="/resources/js/prism.min.js"></script>
|
||||
<!-- Details polyfill-->
|
||||
<script>
|
||||
var details = document.getElementsByTagName("details");
|
||||
var summary = document.getElementsByTagName("summary");
|
||||
for(var i = 0; i < details.length; i++) {
|
||||
(details[i].getAttribute("open") == null) ? details[i].setAttribute("data-open", "false") : details[i].setAttribute("data-open", "true");
|
||||
}
|
||||
for(var i = 0; i < summary.length; i++) {
|
||||
summary[i].addEventListener( "click", function(e) {
|
||||
var parent = this.parentElement;
|
||||
(parent.getAttribute("data-open") == "false") ? parent.setAttribute("data-open", "true") : parent.setAttribute("data-open", "false");
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
// Google Analytics loader: stores the name 'ga' under window.GoogleAnalyticsObject,
|
||||
// defines window.ga as a stub that pushes its arguments onto ga.q (unless ga already
|
||||
// exists), records a timestamp in ga.l, then creates an async <script> pointing at
|
||||
// analytics.js and inserts it before the first <script> element in the document.
|
||||
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
|
||||
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
|
||||
// Issue the 'create' command for property UA-58931649-1, then send a pageview.
|
||||
ga('create', 'UA-58931649-1', 'auto');
|
||||
ga('send', 'pageview');
|
||||
</script>
|
||||
<footer role="contentinfo"><span class="slogan copyright">© 2015 Syllogism Co. | <a href="mailto:contact@spacy.io">Contact</a></span></footer>
|
||||
</body>
|
||||
</html>
|
35
website/src/jade/blog/dead-code-should-be-buried/index.jade
Normal file
35
website/src/jade/blog/dead-code-should-be-buried/index.jade
Normal file
|
@ -0,0 +1,35 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
|
||||
+WritePost(Meta)
|
||||
section.intro
|
||||
p Natural Language Processing moves fast, so maintaining a good library means constantly throwing things away. Most libraries are failing badly at this, as academics hate to editorialize. This post explains the problem, why it's so damaging, and why I wrote #[a(href="http://spacy.io") spaCy] to do things differently.
|
||||
|
||||
p Imagine: you try to use Google Translate, but it asks you to first select which model you want. The new, awesome deep-learning model is there, but so are lots of others. You pick one that sounds fancy, but it turns out it's a 20-year old experimental model trained on a corpus of oven manuals. When it performs little better than chance, you can't even tell from its output. Of course, Google Translate would not do this to you. But most Natural Language Processing libraries do, and it's terrible.
|
||||
|
||||
p Natural Language Processing (NLP) research moves very quickly. The new models supersede the old ones. And yet most NLP libraries are loath to ever throw anything away. The ones that have been around a long time then start to look very large and impressive. But big is not beautiful here. It is not a virtue to present users with a dozen bad options.
|
||||
|
||||
p Have a look through the #[a(href="http://gate.ac.uk/sale/tao/split.html") GATE software]. There's a lot there, developed over 12 years and many person-hours. But there's approximately zero curation. The philosophy is just to provide things. It's up to you to decide what to use.
|
||||
|
||||
p This is bad. It's bad to provide an implementation of #[a(href="https://gate.ac.uk/sale/tao/splitch18.html") MiniPar], and have it just...sit there, with no hint that it's 20 years old and should not be used. The RASP parser, too. Why are these provided? Worse, why is there no warning? The #[a(href="http://webdocs.cs.ualberta.ca/~lindek/minipar.htm") Minipar homepage] puts the software in the right context:
|
||||
|
||||
blockquote
|
||||
p MINIPAR is a broad-coverage parser for the English language. An evaluation with the SUSANNE corpus shows that MINIPAR achieves about 88% precision and 80% recall with respect to dependency relationships. MINIPAR is very efficient, #[strong on a Pentium II 300 with 128MB memory], it parses about 300 words per second.
|
||||
|
||||
p Ideally there would be a date, but it's still obvious that this isn't software anyone should be executing in 2015, unless they're investigating the history of the field.
|
||||
|
||||
p A less extreme example is #[a(href="http://nlp.stanford.edu/software/corenlp.shtml") CoreNLP]. They offer a range of models with complicated speed/accuracy/loading time trade-offs, many with subtly different output. Mostly no model is strictly dominated by another, so there's some case for offering all these options. But to my taste there's still far too much there, and the recommendation of what to use is far from clear.
|
||||
|
||||
h3 Why I didn't contribute to NLTK
|
||||
|
||||
p Various people have asked me why I decided to make a new Python NLP library, #[a(href="http://spacy.io") spaCy], instead of supporting the #[a(href="http://nltk.org") NLTK] project. This is the main reason. You can't contribute to a project if you believe that the first thing that they should do is throw almost all of it away. You should just make your own project, which is what I did.
|
||||
p Have a look through #[a(href="http://www.nltk.org/py-modindex.html") the module list of NLTK]. It looks like there's a lot there, but there's not. What NLTK has is a decent tokenizer, some passable stemmers, a good implementation of the Punkt sentence boundary detector (after #[a(href="http://joelnothman.com/") Joel Nothman] rewrote it), some visualization tools, and some wrappers for other libraries. Nothing else is of any use.
|
||||
|
||||
p For instance, consider #[code nltk.parse]. You might think that amongst all this code there was something that could actually predict the syntactic structure of a sentence for you, but you would be wrong. There are wrappers for the BLLIP and Stanford parsers, and since March there's been an implementation of Nivre's 2003 transition-based dependency parser. Unfortunately no model is provided for it, as they rely on an external wrapper of an external learner, which is unsuitable for the structure of their problem. So the implementation is too slow to be actually useable.
|
||||
|
||||
p This problem is totally avoidable, if you just sit down and write good code, instead of stitching together external dependencies. I pointed NLTK to my tutorial describing #[a(href="http://spacy.io/blog/parsing-english-in-python/") how to implement a modern dependency parser], which includes a BSD-licensed implementation in 500 lines of Python. I was told "thanks but no thanks", and #[a(href="https://github.com/nltk/nltk/issues/694") the issue was abruptly closed]. Another researcher's offer from 2012 to implement this type of model also went #[a(href="http://arxiv.org/pdf/1409.7386v1.pdf") unanswered].
|
||||
|
||||
p The story in #[code nltk.tag] is similar. There are plenty of wrappers, for the external libraries that have actual taggers. The only actual tagger model they distribute is #[a(href="http://spacy.io/blog/part-of-speech-POS-tagger-in-python/") terrible]. Now it seems that #[a(href="https://github.com/nltk/nltk/issues/1063") NLTK does not even know how its POS tagger was trained]. The model is just this .pickle file that's been passed around for 5 years, its origins lost to time. It's not okay to offer this to people, to recommend they use it.
|
||||
|
||||
p I think open source software should be very careful to make its limitations clear. It's a disservice to provide something that's much less useful than you imply. It's like offering your friend a lift and then not showing up. It's totally fine to not do something – so long as you never suggested you were going to do it. There are ways to do worse than nothing.
|
16
website/src/jade/blog/dead-code-should-be-buried/meta.jade
Normal file
16
website/src/jade/blog/dead-code-should-be-buried/meta.jade
Normal file
|
@ -0,0 +1,16 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "Dead Code Should be Buried"
|
||||
- Meta.description = "Natural Language Processing moves fast, so maintaining a good library means constantly throwing things away. Most libraries are failing badly at this, as academics hate to editorialize. This post explains the problem, why it's so damaging, and why I wrote spaCy to do things differently."
|
||||
- Meta.date = "2015-09-04"
|
||||
- Meta.url = "/blog/dead-code-should-be-buried"
|
||||
- Meta.links = [{}, {}]
|
||||
- Meta.links[0].id = 'hn'
|
||||
- Meta.links[0].name = "Hacker News Thread"
|
||||
- Meta.links[0].title = 'Hacker News'
|
||||
- Meta.links[0].url = "http://news.ycombinator.com/item?id=10173669"
|
||||
- Meta.links[1].id = 'reddit'
|
||||
- Meta.links[1].name = "Reddit"
|
||||
- Meta.links[1].title = '/r/programming'
|
||||
- Meta.links[1].url = "https://www.reddit.com/r/programming/comments/3jmgck/dead_code_should_be_buried_why_i_wrote_spacy/"
|
||||
|
50
website/src/jade/blog/displacy/index.jade
Normal file
50
website/src/jade/blog/displacy/index.jade
Normal file
|
@ -0,0 +1,50 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
//- Displacy(sentence, caption_text, height): renders a displaCy demo iframe of the given height, a link that opens the interactive visualizer for `sentence` in a new tab, and `caption_text` as the caption.
|
||||
//- The link URL encodes every space as %20. A string pattern in String.replace only replaces the first match, so a global regex is used.
|
||||
//- NOTE(review): the iframe src is fixed to robots.html and does not depend on `sentence` - confirm this is intended.
|
||||
mixin Displacy(sentence, caption_text, height)
|
||||
- var url = "/displacy/?full=" + sentence.replace(/ /g, "%20")
|
||||
|
||||
.displacy
|
||||
iframe.displacy(src="/resources/displacy/robots.html" height=height)
|
||||
|
||||
a.view-displacy(href=url, target="_blank")
|
||||
| Interactive Visualizer
|
||||
|
||||
p.caption.
|
||||
#{caption_text}
|
||||
|
||||
|
||||
|
||||
+WritePost(Meta)
|
||||
p #[img.title(src="/resources/img/displacy_screenshot.jpg" alt="Screenshot of syntactic dependency parse")]
|
||||
|
||||
p A syntactic dependency parse is a kind of shallow meaning representation. It's an important piece of many language understanding and text processing technologies. Now that these representations can be computed quickly, and with increasingly high accuracy, they're being used in lots of applications – translation, sentiment analysis, and summarization are major application areas.
|
||||
|
||||
p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="http://spaCy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="/displacy") displaCy]:
|
||||
|
||||
+Displacy("Robots in popular culture are there to remind us of the awesomeness of unbounded human agency", "Click the button to full-screen and interact, or scroll to see the full parse.", 325)
|
||||
|
||||
p The #[a(href="https://code.google.com/p/whatswrong/") best alternative] is a Java command-line tool that outputs static images, which look like this:
|
||||
|
||||
p
|
||||
img(src="/resources/img/ark_example.jpg" alt="Output of the Brat parse tree visualizer")
|
||||
|
||||
p I find the output of the CMU visualizer basically unreadable. Pretty much all visualizers suffer from this problem: they don't add enough space. I always thought this was a hard problem, and a good Javascript visualizer would need to do something crazy with Canvas. Ines quickly proposed a much better solution, based on native, web-standard technologies.
|
||||
|
||||
p The idea is to use CSS to draw shapes, mostly with border styling, and some arithmetic to figure out the spacing:
|
||||
|
||||
blockquote
|
||||
p The arrow needs only one HTML element, #[code <div class="arrow">] and the CSS pseudo-elements #[code :before] and #[code :after]. The #[code :before] pseudo-element is used for the arc and is essentially a circle (#[code border-radius: 50%]) with a black outline. Since its parent #[code .arrow] is only half its height and set to #[code overflow: hidden], it’s "cut in half" and ends up looking like a half circle.
|
||||
|
||||
footer: cite #[a(href="http://ines.io/blog/developing-displacy") Ines Montani, #[em Developing Displacy]]
|
||||
|
||||
p To me, this seemed like witchcraft, or a hack at best. But I was quickly won over: if all we do is declare the data and the relationships, in standards-compliant HTML and CSS, then we can simply step back and let the browser do its job. We know the code will be small, the layout will work on a variety of displays, and we'll have a ready separation of style and content. For long output, we simply let the graphic overflow, and let users scroll.
|
||||
|
||||
p What I'm particularly excited about is the potential for displaCy as an #[a(href="http://spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
|
||||
|
||||
ul
|
||||
li You're always asked a question. You don't have to decide-what-to-decide.
|
||||
li The viewport can scroll with the user, making it easier to work with spacious, readable designs.
|
||||
li With only 4-6 different actions, it's easy to have key-based input.
|
||||
|
||||
p Efficient manual annotation is incredibly important. If we can get that right, then we can offer you cheap domain adaptation. You give us some text, we get it annotated, and ship you a custom model, that's much more accurate on your data. If you're interested in helping us beta test this idea, #[a(href="mailto:contact@spacy.io") get in touch].
|
12
website/src/jade/blog/displacy/meta.jade
Normal file
12
website/src/jade/blog/displacy/meta.jade
Normal file
|
@ -0,0 +1,12 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "Displaying linguistic structure with CSS"
|
||||
- Meta.description = "One of the features of the relaunch I'm most excited about is the displaCy visualizer and annotation tool. This solves two problems I've thought about a lot: first, how can I help people understand what information spaCy gives them access to? Without a good visualization, the ideas are very abstract. Second, how can we make dependency trees easy for humans to create?"
|
||||
- Meta.date = "2015-08-19"
|
||||
- Meta.url = "/blog/displacy"
|
||||
- Meta.links = [{}]
|
||||
- Meta.links[0].id = 'reddit'
|
||||
- Meta.links[0].name = 'Reddit'
|
||||
- Meta.links[0].title = 'Discuss on Reddit'
|
||||
- Meta.links[0].url = "https://www.reddit.com/r/programming/comments/3hoj0b/displaying_linguistic_structure_with_css/"
|
||||
- Meta.image = "http://spacy.io/resources/img/displacy_screenshot.jpg"
|
|
@ -0,0 +1,37 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
section.intro
|
||||
|
||||
p (As told with the #[a(href="http://splasho.com/upgoer5/") ten hundred most common words] that I speak.)
|
||||
|
||||
p When I was little, my favorite TV shows all had talking computers. Now I'm big and there are still no talking computers. At least, not really talking. We can make them, like, say things — but I want them to tell us things. And I want them to listen, and to read. Why is this so hard?
|
||||
|
||||
p It turns out that almost anything we say could mean many many different things, but we don't notice because almost all of those meanings would be weird or stupid or just not possible. If I say:
|
||||
|
||||
p.example #[a(href="http://spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
|
||||
|
||||
p Would you ever ask me,
|
||||
|
||||
p.example “Were you in the dress, or was the movie in the dress?”
|
||||
|
||||
p It's weird to even think of that. But a computer just might, because there are other cases like:
|
||||
|
||||
p.example #[a(href="http://spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
|
||||
|
||||
p Where the words hang together in the other way. People used to think that the answer was to tell the computer lots and lots of facts. But then you wake up one day and you're writing facts like #[em movies do not wear dresses], and you wonder where it all went wrong. Actually it's even worse than that. Not only are there too many facts, most of them are not even really facts! #[a(href="https://en.wikipedia.org/wiki/Cyc") People really tried this]. We've found that the world is made up of #[em if]s and #[em but]s.
|
||||
|
||||
p These days we just show the computer lots and lots and lots of words. We gave up trying to get it to understand what a “dress” is. We let #[em dress] be just some letters. But if it is seen around #[em girl] enough times (which is just some other letters, which are seen around some #[strong other] other letters), it can make good guesses.
|
||||
|
||||
p It doesn't always guess right, but we can tell how often it does, and we can think of ways to help it learn better. We have a number, and we can slowly make it bigger, a little bit by a little bit.
|
||||
|
||||
p (One thing I've learned is, people are great at making a number bigger, if you pay a lot of them to try. The key is to pick numbers where, if they make the number bigger, they can't help but have done something actually good. This is harder than it sounds. Some say no numbers are like this. I ask them to show me much good being done another way, but they never can.)
|
||||
|
||||
blockquote.pull-quote
|
||||
+TweetThis("Instead of telling the computer facts, what we needed to do was tell it how to learn.", Meta.url)
|
||||
|
||||
p The ideas we come up with for getting the computer to talk, listen or read a little better can be used to get it to see or plan a little better, and the other way around. Once we stopped telling it things like “#[em movies do not wear dresses]”, things really took off.
|
||||
|
||||
p Each bit of work still only makes our numbers a little bit bigger, and the bigger the numbers go, the harder they are to raise. But that is a good problem to have. Now that computers can read quite well, I think we should be able to do pretty great things. What should we get them to read?
|
||||
|
15
website/src/jade/blog/eli5-computers-learn-reading/meta.jade
Normal file
15
website/src/jade/blog/eli5-computers-learn-reading/meta.jade
Normal file
|
@ -0,0 +1,15 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "Statistical NLP in Basic English"
|
||||
- Meta.description = "When I was little, my favorite TV shows all had talking computers. Now I’m big and there are still no talking computers, so I’m trying to make some myself. Well, we can make computers say things. But when we say things back, they don’t really understand. Why not?"
|
||||
- Meta.date = "2015-08-24"
|
||||
- Meta.url = "/blog/eli5-computers-learn-reading/"
|
||||
- Meta.links = []
|
||||
//- Meta.links[0].id = 'reddit'
|
||||
//- Meta.links[0].name = "Reddit"
|
||||
//- Meta.links[0].title = 'Reddit Thread'
|
||||
//- Meta.links[0].url = "https://www.reddit.com/r/technology/comments/3i8utl/computers_are_learning_to_read_because_we_show/"
|
||||
//- Meta.links[1].id = 'hn'
|
||||
//- Meta.links[1].name = "Hacker News Thread"
|
||||
//- Meta.links[1].title = 'Hacker News'
|
||||
//- Meta.links[1].url = "https://news.ycombinator.com/item?id=7658864"
|
154
website/src/jade/blog/how-spacy-works/index.jade
Normal file
154
website/src/jade/blog/how-spacy-works/index.jade
Normal file
|
@ -0,0 +1,154 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
|
||||
p The following are some hasty preliminary notes on how spaCy works. The short story is, there are no new killer algorithms. The way that the tokenizer works is novel and a bit neat, and the parser has a new feature set, but otherwise the key algorithms are well known in the recent literature.
|
||||
|
||||
p Some might also wonder how I get Python code to run so fast. I don't – spaCy is written in #[a(href="http://cython.org") Cython], an optionally statically-typed language that compiles to C or C++, which is then loaded as a C extension module. This makes it #[a(href="/blog/writing-c-in-cython") easy] to achieve the performance of native C code, but allows the use of Python language features, via the Python C API. The Python unicode library was particularly useful to me. I think it would have been much more difficult to write spaCy in another language.
|
||||
|
||||
|
||||
h3 Tokenizer and Lexicon
|
||||
|
||||
p Tokenization is the task of splitting a string into meaningful pieces, called tokens, which you can then compute with. In practice, the task is usually to match the tokenization performed in some treebank, or other corpus. If we want to apply a tagger, entity recogniser, parser etc, then we want our run-time text to match the training conventions. If we want to use a model that's been trained to expect "isn't" to be split into two tokens, ["is", "n't"], then that's how we need to prepare our data.
|
||||
|
||||
p In order to train spaCy's models with the best data available, I therefore tokenize English according to the Penn Treebank scheme. It's not perfect, but it's what everybody is using, and it's good enough.
|
||||
|
||||
h3 What we don't do
|
||||
|
||||
p The Penn Treebank was distributed with a script called tokenizer.sed, which tokenizes ASCII newswire text roughly according to the Penn Treebank standard. Almost all tokenizers are based on these regular expressions, with various updates to account for unicode characters, and the fact that it's no longer 1986 – today's text has URLs, emails, emoji, etc.
|
||||
|
||||
p Usually, the resulting regular expressions are applied in multiple passes, which is quite inefficient. Often no care is taken to preserve indices into the original string. If you lose these indices, it'll be difficult to calculate mark-up based on your annotations.
|
||||
|
||||
h3 Tokenizer Algorithm
|
||||
|
||||
p spaCy's tokenizer assumes that no tokens will cross whitespace – there will be no multi-word tokens. If we want these, we can post-process the token-stream later, merging as necessary. This assumption allows us to deal only with small chunks of text. We can cache the processing of these, and simplify our expressions somewhat.
|
||||
|
||||
p Here is what the outer-loop would look like in Python. (You can see the production implementation, in Cython, #[a(href="https://github.com/honnibal/spaCy/blob/master/spacy/tokenizer.pyx#L56") here].)
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| cache = {}
|
||||
| def tokenize(text):
|
||||
| tokens = []
|
||||
| for substring in text.split(' '):
|
||||
| if substring in cache:
|
||||
| tokens.extend(cache[substring])
|
||||
| else:
|
||||
| subtokens = _tokenize_substring(substring)
|
||||
| tokens.extend(subtokens)
|
||||
| cache[substring] = subtokens
|
||||
| return tokens
|
||||
|
||||
p The actual work is performed in #[code _tokenize_substring]. For this, I divide the tokenization rules into three pieces:
|
||||
|
||||
ul
|
||||
li A prefixes expression, which matches from the start of the string;
|
||||
li A suffixes expression, which matches from the end of the string;
|
||||
li A special-cases table, which matches the whole string.
|
||||
|
||||
p The algorithm then proceeds roughly like this (consider this like pseudo-code; this was written quickly and has not been executed):
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| # Tokens which can be attached at the beginning or end of another
|
||||
| prefix_re = _make_re([",", '"', '(', ...])
|
||||
| suffix_re = _make_re([",", "'", ":", "'s", ...])
|
||||
|
||||
| # Contractions etc are simply enumerated, since they're a finite set. We
|
||||
| # can also specify anything we like here, which is nice --- different data
|
||||
| # has different quirks, so we want to be able to add ad hoc exceptions.
|
||||
| special_cases = {
|
||||
| "can't": ("ca", "n't"),
|
||||
| "won't": ("wo", "n't"),
|
||||
| "he'd've": ("he", "'d", "'ve"),
|
||||
| ...
|
||||
| ":)": (":)",) # We can add any arbitrary thing to this list.
|
||||
| }
|
||||
|
||||
| def _tokenize_substring(substring):
|
||||
| prefixes = []
|
||||
| suffixes = []
|
||||
| while substring not in special_cases:
|
||||
| prefix, substring = _apply_re(substring, prefix_re)
|
||||
| if prefix:
|
||||
| prefixes.append(prefix)
|
||||
| else:
|
||||
| suffix, substring = _apply_re(substring, suffix_re)
|
||||
| if suffix:
|
||||
| suffixes.append(suffix)
|
||||
| else:
|
||||
| break
|
||||
|
||||
p This procedure splits off tokens from the start and end of the string, at each point checking whether the remaining string is in our special-cases table. If it is, we stop splitting, and return the tokenization at that point.
|
||||
|
||||
p The advantage of this design is that the prefixes, suffixes and special-cases can be declared separately, in easy-to-understand files. If a new entry is added to the special-cases, you can be sure that it won't have some unforeseen consequence to a complicated regular-expression grammar.
|
||||
|
||||
h3 Coupling the Tokenizer and Lexicon
|
||||
|
||||
p As mentioned above, the tokenizer is designed to support easy caching. If all we were caching were the matched substrings, this would not be so advantageous. Instead, what we do is create a struct which houses all of our lexical features, and cache #[em that]. The tokens are then simply pointers to these rich lexical types.
|
||||
|
||||
p In a sample of text, vocabulary size grows exponentially slower than word count. So any computations we can perform over the vocabulary and apply to the word count are efficient.
|
||||
|
||||
h3 Part-of-speech Tagger
|
||||
|
||||
p In 2013, I wrote a blog post describing #[a(href="/blog/part-of-speech-POS-tagger-in-python/") how to write a good part of speech tagger]. My recommendation then was to use greedy decoding with the averaged perceptron. I think this is still the best approach, so it's what I implemented in spaCy.
|
||||
|
||||
p The tutorial also recommends the use of Brown cluster features, and case normalization features, as these make the model more robust and domain independent. spaCy's tagger makes heavy use of these features.
|
||||
|
||||
h3 Dependency Parser
|
||||
|
||||
p The parser uses the algorithm described in my #[a(href="/blog/parsing-english-in-python/") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
|
||||
|
||||
p Some quick details about spaCy's take on this, for those who happen to know these models well. I'll write up a better description shortly.
|
||||
|
||||
ol
|
||||
li I use greedy decoding, not beam search;
|
||||
li I use the arc-eager transition system;
|
||||
li I use the Goldberg and Nivre (2012) dynamic oracle.
|
||||
li I use the non-monotonic update from my CoNLL 2013 paper (Honnibal, Goldberg and Johnson 2013).
|
||||
|
||||
p So far, this is exactly the configuration from the CoNLL 2013 paper, which scored 91.0. So how have I gotten it to 92.4? The following tweaks:
|
||||
|
||||
ol
|
||||
li I use Brown cluster features – these help a lot;
|
||||
li I redesigned the feature set. I've long known that the Zhang and Nivre (2011) feature set was suboptimal, but a few features don't make a very compelling publication. Still, they're important.
|
||||
li When I do the dynamic oracle training, I also make the update cost-sensitive: if the oracle determines that the move the parser took has a cost of N, then the weights for the gold class are incremented by +N, and the weights for the predicted class are incremented by -N. This only made a small (0.1-0.2%) difference.
|
||||
|
||||
h3 Implementation
|
||||
|
||||
p I don't do anything algorithmically novel to improve the efficiency of the parser. However, I was very careful in the implementation.
|
||||
|
||||
p A greedy shift-reduce parser with a linear model boils down to the following loop:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def parse(words, model, feature_funcs, n_classes):
|
||||
| state = init_state(words)
|
||||
| for _ in range(len(words) * 2):
|
||||
| features = [templ(state) for templ in feature_funcs]
|
||||
| scores = [0 for _ in range(n_classes)]
|
||||
| for feat in features:
|
||||
| weights = model[feat]
|
||||
| for i, weight in enumerate(weights):
|
||||
| scores[i] += weight
|
||||
| class_, score = max(enumerate(scores), key=lambda item: item[1])
|
||||
| transition(state, class_)
|
||||
|
||||
p The parser makes 2N transitions for a sentence of length N. In order to select the transition, it extracts a vector of K features from the state. Each feature is used as a key into a hash table managed by the model. The features map to a vector of weights, of length C. We then dot product the feature weights to the scores vector we are building for that instance.
|
||||
|
||||
p The inner-most loop here is not so bad: we only have a few dozen classes, so it's just a short dot product. Both of the vectors are in the cache, so this is a snack to a modern CPU.
|
||||
|
||||
p The bottle-neck in this algorithm is the 2NK look-ups into the hash-table that we must make, as these almost always have to hit main memory. The feature-set is enormously large, because all of our features are one-hot boolean indicators. Some of the features will be common, so they'll lurk around in the CPU's cache hierarchy. But a lot of them won't be, and accessing main memory takes a lot of cycles.
|
||||
|
||||
p
|
||||
| I used to use the Google dense_hash_map implementation. This seemed a solid choice: it came from a big brand, it was in C++, and it seemed very complicated. Later, I read
|
||||
a(href="http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/")
|
||||
Jeff Preshing's excellent post
|
||||
| on open-addressing with linear probing. This really spoke to me. I had assumed that a fast hash table implementation would necessarily be very complicated, but no – this is another situation where the simple strategy wins.
|
||||
|
||||
p I've packaged my Cython implementation separately from spaCy, in the package #[a(href="https://github.com/syllog1sm/preshed") preshed] – for "pre-hashed", but also as a nod to Preshing. I've also taken great care over the feature extraction and perceptron code, which I'm distributing in a package named #[a(href="https://github.com/honnibal/thinc") thinc] (since it's for learning very sparse models with Cython).
|
||||
|
||||
p By the way: from comparing notes with a few people, it seems common to implement linear models in a way that's suboptimal for multi-class classification. The mistake is to store in the hash-table one weight per (feature, class) pair, rather than mapping the feature to a vector of weights, for all of the classes. This is bad because it means you need to hit the table C times, once per class, as you always need to evaluate a feature against all of the classes. In the case of the parser, this means the hash table is accessed 2NKC times, instead of the 2NK times if you have a weights vector. You should also be careful to store the weights contiguously in memory – you don't want a linked list here. I use a block-sparse format, because my problems tend to have a few dozen classes.
|
||||
|
||||
p I guess if I had to summarize my experience, I'd say that the efficiency of these models is really all about the data structures. We want to stay small, and stay contiguous. Minimize redundancy and minimize pointer chasing. That's why Cython is so well suited to this: we get to lay out our data structures, and manage the memory ourselves, with full C-level control.
|
7
website/src/jade/blog/how-spacy-works/meta.jade
Normal file
7
website/src/jade/blog/how-spacy-works/meta.jade
Normal file
|
@ -0,0 +1,7 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "How spaCy works"
|
||||
- Meta.description = "This post is a work in progress, explaining some of how spaCy is designed and implemented, and noting which algorithms were used. spaCy is built on science, not alchemy, and when new discoveries are made, we publish them. We want to stay on the same page as the academic community, to use their work. Still, explaining everything takes time — so this post isn't yet as complete as we'd like it to be. Stay tuned."
|
||||
- Meta.date = "2015-02-19"
|
||||
- Meta.url = "/blog/how-spacy-works"
|
||||
- Meta.links = []
|
55
website/src/jade/blog/index.jade
Normal file
55
website/src/jade/blog/index.jade
Normal file
|
@ -0,0 +1,55 @@
|
|||
mixin WriteTeaser(Authors, post_title)
|
||||
if post_title == "parsing-english-in-python"
|
||||
include ./parsing-english-in-python/meta.jade
|
||||
else if post_title == "introducing-spacy"
|
||||
include ./introducing-spacy/meta.jade
|
||||
else if post_title == "part-of-speech-POS-tagger-in-python"
|
||||
include ./part-of-speech-POS-tagger-in-python/meta.jade
|
||||
else if post_title == "writing-c-in-cython"
|
||||
include ./writing-c-in-cython/meta.jade
|
||||
else if post_title == "how-spacy-works"
|
||||
include ./how-spacy-works/meta.jade
|
||||
else if post_title == "displacy"
|
||||
include ./displacy/meta.jade
|
||||
else if post_title == "eli5-computers-learn-reading"
|
||||
include ./eli5-computers-learn-reading/meta.jade
|
||||
else if post_title == "dead-code-should-be-buried"
|
||||
include ./dead-code-should-be-buried/meta.jade
|
||||
|
||||
- var Author = Authors[Meta.author_id]
|
||||
article.post
|
||||
header
|
||||
h2 #[a(href=Meta.url)= Meta.headline]
|
||||
+WriteByline(Author, Meta)
|
||||
|
||||
p #{Meta.description} #[a.readmore(href=Meta.url) ►]
|
||||
|
||||
include ../header.jade
|
||||
|
||||
- var Page = InitPage(Site, Authors.spacy, "blog", "Blog")
|
||||
- Page.active.blog = true
|
||||
|
||||
+WritePage(Site, Authors.spacy, Page)
|
||||
section.intro.profile
|
||||
p A lot of work has gone into #[strong spaCy], but no magic. We plan to keep no secrets. We want you to be able to #[a(href="/license") build your business] on #[strong spaCy] – so we want you to understand it. Tell us whether you do. #[span.social #[a(href="//twitter.com/" + Site.twitter, target="_blank") Twitter] #[a(href="mailto:contact@spacy.io") Contact us]]
|
||||
nav(role='navigation')
|
||||
ul
|
||||
li #[a.button(href='#blogs') Blog]
|
||||
li #[a.button(href='#tutorials') Tutorials]
|
||||
|
||||
h2 #[a.permalink(href='#blogs', name='blogs') Blog]
|
||||
|
||||
section.blogs
|
||||
+WriteTeaser(Authors, 'dead-code-should-be-buried')
|
||||
+WriteTeaser(Authors, 'eli5-computers-learn-reading')
|
||||
+WriteTeaser(Authors, 'displacy')
|
||||
+WriteTeaser(Authors, 'introducing-spacy')
|
||||
+WriteTeaser(Authors, 'how-spacy-works')
|
||||
+WriteTeaser(Authors, 'writing-c-in-cython')
|
||||
+WriteTeaser(Authors, 'parsing-english-in-python')
|
||||
+WriteTeaser(Authors, 'part-of-speech-POS-tagger-in-python')
|
||||
section.intro
|
||||
h2 #[a.permalink(href='#tutorials', name='tutorials') Tutorials]
|
||||
|
||||
section.tutorials
|
||||
include ../tutorials/_teaser.jade
|
22
website/src/jade/blog/introducing-spacy/index.jade
Normal file
22
website/src/jade/blog/introducing-spacy/index.jade
Normal file
|
@ -0,0 +1,22 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
|
||||
+WritePost(Meta)
|
||||
p spaCy is a new library for text processing in Python and Cython. I wrote it because I think small companies are terrible at natural language processing (NLP). Or rather: small companies are using terrible NLP technology.
|
||||
|
||||
p To do great NLP, you have to know a little about linguistics, a lot about machine learning, and almost everything about the latest research. The people who fit this description seldom join small companies. Most are broke – they've just finished grad school. If they don't want to stay in academia, they join Google, IBM, etc.
|
||||
|
||||
p The net result is that outside of the tech giants, commercial NLP has changed little in the last ten years. In academia, it's changed entirely. Amazing improvements in quality. Orders of magnitude faster. But the academic code is always GPL, undocumented, unusable, or all three. You could implement the ideas yourself, but the papers are hard to read, and training data is exorbitantly expensive. So what are you left with? A common answer is NLTK, which was written primarily as an educational resource. Nothing past the tokenizer is suitable for production use.
|
||||
|
||||
p I used to think that the NLP community just needed to do more to communicate its findings to software engineers. So I wrote two blog posts, explaining #[a(href="https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/") how to write a part-of-speech tagger] and #[a(href="https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/") parser]. Both were well received, and there's been a bit of interest in #[a(href="https://github.com/syllog1sm/redshift/tree/develop") my research software] – even though it's entirely undocumented, and mostly unusable to anyone but me.
|
||||
|
||||
p So six months ago I quit my post-doc, and I've been working day and night on spaCy since. I'm now pleased to announce an alpha release.
|
||||
|
||||
p If you're a small company doing NLP, I think spaCy will seem like a minor miracle. It's by far the fastest NLP software ever released. The full processing pipeline completes in 20ms per document, including accurate tagging and parsing. All strings are mapped to integer IDs, tokens are linked to embedded word representations, and a range of useful features are pre-calculated and cached.
|
||||
|
||||
blockquote.pull-quote
|
||||
+TweetThis("Computers don't understand text. This is unfortunate, because that's what the web is mostly made of.", Meta.url)
|
||||
|
||||
p If none of that made any sense to you, here's the gist of it. Computers don't understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it.
|
||||
p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can #[a(href="/license") buy a commercial license] under generous terms.
|
15
website/src/jade/blog/introducing-spacy/meta.jade
Normal file
15
website/src/jade/blog/introducing-spacy/meta.jade
Normal file
|
@ -0,0 +1,15 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "Introducing spaCy"
|
||||
- Meta.description = "Computers don't understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it. spaCy provides a library of utility functions that help programmers build such products."
|
||||
- Meta.date = "2015-02-19"
|
||||
- Meta.url = "/blog/introducing-spacy"
|
||||
- Meta.links = [{}, {}]
|
||||
- Meta.links[0].id = 'reddit'
|
||||
- Meta.links[0].name = "Reddit"
|
||||
- Meta.links[0].title = 'Reddit Thread'
|
||||
- Meta.links[0].url = "https://www.reddit.com/r/programming/comments/2tlyrr/spacy_industrialstrength_nlp_with_pythoncython"
|
||||
- Meta.links[1].id = 'hn'
|
||||
- Meta.links[1].name = "Hacker News Thread"
|
||||
- Meta.links[1].title = 'Hacker News'
|
||||
- Meta.links[1].url = "https://news.ycombinator.com/item?id=8942783"
|
535
website/src/jade/blog/parsing-english-in-python/index.html
Normal file
535
website/src/jade/blog/parsing-english-in-python/index.html
Normal file
|
@ -0,0 +1,535 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>Parsing English in 500 lines of Python | spaCy.io</title>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
|
||||
<meta name="description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
|
||||
<meta itemprop="name" content="Parsing English in 500 lines of Python">
|
||||
<meta itemprop="description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
|
||||
<meta itemprop="image" content="http://spacy.io/resources/img/social.png">
|
||||
<meta name="twitter:card" content="summary">
|
||||
<meta name="twitter:site" content="spacy_io">
|
||||
<meta name="twitter:title" content="Parsing English in 500 lines of Python">
|
||||
<meta name="twitter:description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
|
||||
<meta name="twitter:creator" content="@honnibal">
|
||||
<meta name="twitter:image" content="http://spacy.io/resources/img/social_small.png">
|
||||
<meta property="og:title" content="Parsing English in 500 lines of Python">
|
||||
<meta property="og:type" content="article">
|
||||
<meta property="og:url" content="http://spacy.io/blog/parsing-english-in-python">
|
||||
<meta property="og:image" content="http://spacy.io/resources/img/social.png">
|
||||
<meta property="og:description" content="This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant.">
|
||||
<meta property="og:site_name" content="spaCy.io">
|
||||
<meta property="article:published_time" content="2013-12-18T00:00:00.000Z">
|
||||
<link rel="stylesheet" href="/resources/css/style.css">
|
||||
<!--[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]-->
|
||||
</head>
|
||||
<body id="blog">
|
||||
<header role="banner">
|
||||
<h1 class="logo">spaCy.io</h1>
|
||||
<div class="slogan">Blog
|
||||
</div>
|
||||
</header>
|
||||
<nav role="navigation">
|
||||
<li><a href="/">Home</a></li>
|
||||
<li><a href="/docs">Docs</a></li>
|
||||
<li><a href="/license">License</a></li>
|
||||
<li class="active"><a href="/blog">Blog</a></li>
|
||||
</nav>
|
||||
<main id="content">
|
||||
<article class="post">
|
||||
<header>
|
||||
<h2>Parsing English in 500 lines of Python</h2>
|
||||
<div class="subhead">by <a href="//twitter.com/honnibal" rel="author" target="_blank">Matthew Honnibal</a> on
|
||||
<time>December 18, 2013</time>
|
||||
</div>
|
||||
</header>
|
||||
<p class="box infobox"><strong class="note">2015-08-19 Update:</strong> I wrote this blog post in 2013, describing an exciting advance in natural language understanding technology. Today, almost all high-performance parsers are using a variant of the algorithm described below (including spaCy). The original post is preserved below, with added commentary in light of recent research.</p>
|
||||
<p>A <a href="http://googleresearch.blogspot.de/2013/05/syntactic-ngrams-over-time.html">syntactic parser</a> describes a sentence’s grammatical structure, to help another application reason about it. Natural languages introduce many unexpected ambiguities, which our world-knowledge immediately filters out. A favourite example:</p>
|
||||
<p class="example">They ate the pizza with anchovies</p>
|
||||
<p><img src="/resources/img/anchovies.png" alt="Eat-with pizza-with ambiguity"></p>
|
||||
<p>A correct parse links “with” to “pizza”, while an incorrect parse links “with” to “eat”:</p>
|
||||
<div class="displacy">
|
||||
<iframe src="/resources/displacy/anchovies_bad.html" height="275"></iframe>
|
||||
</div>
|
||||
<div class="displacy">
|
||||
<iframe src="/resources/displacy/anchovies_good.html" height="275" class="displacy"></iframe>
|
||||
<p class="caption">Prepositional phrase attachment is a common source of errors for statistical parsers.</p>
|
||||
</div>
|
||||
<p>The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.</p>
|
||||
<p class="box infobox"><strong class="note">Update:</strong> CoreNLP now features high-performance transition-based models. It is much faster than the Redshift parser (my research system), but less accurate. spaCy is faster again still, more accurate than CoreNLP, but less accurate than Redshift, due to spaCy's use of greedy search. It would be relatively easy to provide a beam-search version of spaCy...But, I think the gap in accuracy will continue to close, especially given advances in neural network learning.</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Parser</th>
|
||||
<th>Accuracy</th>
|
||||
<th>Speed (w/s)</th>
|
||||
<th>Language</th>
|
||||
<th>LOC</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Stanford</td>
|
||||
<td>89.6%</td>
|
||||
<td>19</td>
|
||||
<td>Java</td>
|
||||
<td>> 4,000 <sup><a href="#note-1">[1]</a></sup></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>parser.py</strong></td>
|
||||
<td>89.8%</td>
|
||||
<td>2,020</td>
|
||||
<td>Python</td>
|
||||
<td><strong>~500</strong></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Redshift</td>
|
||||
<td><strong>93.6%</strong></td>
|
||||
<td><strong>2,580</strong></td>
|
||||
<td>Cython</td>
|
||||
<td>~4,000</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>The rest of the post sets up the problem, and then takes you through <a href="https://gist.github.com/syllog1sm/10343947">a concise implementation</a>, prepared for this post. The first 200 lines of parser.py, the part-of-speech tagger and learner, are described <a href="#">here</a>. You should probably at least skim that post before reading this one, unless you’re very familiar with NLP research.</p>
|
||||
<p>The Cython system, Redshift, was written for my current research. I plan to improve it for general use in June, after my contract ends at Macquarie University. The current version is <a href="http://github.com/syllog1sm/redshift">hosted on GitHub</a>.</p>
|
||||
<h3>Problem Description</h3>
|
||||
<p>It’d be nice to type an instruction like this into your phone:</p>
|
||||
<p class="example">Set volume to zero when I’m in a meeting, unless John’s school calls.</p>
|
||||
<p>And have it set the appropriate policy. On Android you can do this sort of thing with <a href="https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm">Tasker</a>, but an NL interface would be much better. It’d be especially nice to receive a meaning representation you could edit, so you could see what it thinks you said, and correct it.</p>
|
||||
<p>There are lots of problems to solve to make that work, but some sort of syntactic representation is definitely necessary. We need to know that:</p>
|
||||
<p class="example">Unless John’s school calls, when I’m in a meeting, set volume to zero</p>
|
||||
<p>is another way of phrasing the first instruction, while:</p>
|
||||
<p class="example">Unless John’s school, call when I’m in a meeting</p>
|
||||
<p>means something completely different.</p>
|
||||
<p>A dependency parser returns a graph of word-word relationships, intended to make such reasoning easier. Our graphs will be trees – edges will be directed, and every node (word) will have exactly one incoming arc (one dependency, with its head), except one.</p>
|
||||
<h4>Example usage</h4>
|
||||
<pre class="language-python"><code>parser = parser.Parser()
|
||||
tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split()
|
||||
>>> tags, heads = parser.parse(tokens)
|
||||
>>> heads
|
||||
[-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11]
|
||||
>>> for i, h in enumerate(heads):
|
||||
... head = tokens[h] if h >= 0 else 'None'
|
||||
... print(tokens[i] + ' <-- ' + head)
|
||||
Set <-- None
|
||||
the <-- volume
|
||||
volume <-- Set
|
||||
to <-- Set
|
||||
zero <-- to
|
||||
when <-- Set
|
||||
I <-- 'm
|
||||
'm <-- when
|
||||
in <-- 'm
|
||||
a <-- meeting
|
||||
meeting <-- in
|
||||
unless <-- Set
|
||||
John <-- 's
|
||||
's <-- calls
|
||||
school <-- calls
|
||||
calls <-- unless</code></pre>
|
||||
<p>The idea is that it should be slightly easier to reason from the parse, than it was from the string. The parse-to-meaning mapping is hopefully simpler than the string-to-meaning mapping.</p>
|
||||
<p>The most confusing thing about this problem area is that “correctness” is defined by convention — by annotation guidelines. If you haven’t read the guidelines and you’re not a linguist, you can’t tell whether the parse is “wrong” or “right”, which makes the whole task feel weird and artificial.</p>
|
||||
<p>For instance, there’s a mistake in the parse above: “John’s school calls” is structured wrongly, according to the Stanford annotation guidelines. The structure of that part of the sentence is how the annotators were instructed to parse an example like “John’s school clothes”.</p>
|
||||
<p>It’s worth dwelling on this point a bit. We could, in theory, have written our guidelines so that the “correct” parses were reversed. There’s good reason to believe the parsing task will be harder if we reversed our convention, as it’d be less consistent with the rest of the grammar. <sup><a href="#note-2">[2]</a></sup> But we could test that empirically, and we’d be pleased to gain an advantage by reversing the policy.</p>
|
||||
<p>We definitely do want that distinction in the guidelines — we don’t want both to receive the same structure, or our output will be less useful. The annotation guidelines strike a balance between what distinctions downstream applications will find useful, and what parsers will be able to predict easily.</p>
|
||||
<h4>Projective trees</h4>
|
||||
<p>There’s a particularly useful simplification that we can make, when deciding what we want the graph to look like: we can restrict the graph structures we’ll be dealing with. This doesn’t just give us a likely advantage in learnability; it can have deep algorithmic implications. We follow most work on English in constraining the dependency graphs to be <em>projective trees</em>:</p>
|
||||
<ol>
|
||||
<li>Tree. Every word has exactly one head, except for the dummy ROOT symbol.</li>
|
||||
<li>Projective. For every pair of dependencies (a1, a2) and (b1, b2), if a1 < b2, then a2 >= b2. In other words, dependencies cannot “cross”. You can’t have a pair of dependencies that goes a1 b1 a2 b2, or b1 a1 b2 a2.</li>
|
||||
</ol>
|
||||
<p>There’s a rich literature on parsing non-projective trees, and a smaller literature on parsing DAGs. But the parsing algorithm I’ll be explaining deals with projective trees.</p>
|
||||
<h3>Greedy transition-based parsing</h3>
|
||||
<p>Our parser takes as input a list of string tokens, and outputs a list of head indices, representing edges in the graph. If the <em>i</em>th member of heads is <em>j</em>, the dependency parse contains an edge (j, i). A transition-based parser is a finite-state transducer; it maps an array of N words onto an output array of N head indices:</p>
|
||||
<table class="center">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><em>start</em></td>
|
||||
<td>MSNBC</td>
|
||||
<td>reported</td>
|
||||
<td>that</td>
|
||||
<td>Facebook</td>
|
||||
<td>bought</td>
|
||||
<td>WhatsApp</td>
|
||||
<td>for</td>
|
||||
<td>$16bn</td>
|
||||
<td><em>root</em></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>0</td>
|
||||
<td>2</td>
|
||||
<td>9</td>
|
||||
<td>2</td>
|
||||
<td>4</td>
|
||||
<td>2</td>
|
||||
<td>4</td>
|
||||
<td>4</td>
|
||||
<td>7</td>
|
||||
<td>0</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>The heads array denotes that the head of <em>MSNBC</em> is <em>reported</em>:
|
||||
<em>MSNBC</em> is word 1, and <em>reported</em> is word 2, and <code class="language-python">heads[1] == 2</code>. You can already see why parsing a tree is handy — this data structure wouldn’t work if we had to output a DAG, where words may have multiple heads.
|
||||
</p>
|
||||
<p>Although <code class="language-python">heads</code> can be represented as an array, we’d actually like to maintain some alternate ways to access the parse, to make it easy and efficient to extract features. Our <code class="language-python">Parse</code> class looks like this:</p>
|
||||
<pre class="language-python"><code>class Parse(object):
|
||||
def __init__(self, n):
|
||||
self.n = n
|
||||
self.heads = [None] * (n-1)
|
||||
self.lefts = []
|
||||
self.rights = []
|
||||
for i in range(n+1):
|
||||
self.lefts.append(DefaultList(0))
|
||||
self.rights.append(DefaultList(0))
|
||||
|
||||
def add_arc(self, head, child):
|
||||
self.heads[child] = head
|
||||
if child < head:
|
||||
self.lefts[head].append(child)
|
||||
else:
|
||||
self.rights[head].append(child)</code></pre>
|
||||
<p>As well as the parse, we also have to keep track of where we’re up to in the sentence. We’ll do this with an index into the <code class="language-python">words</code> array, and a stack, to which we’ll push words, before popping them once their head is set. So our state data structure is fundamentally:</p>
|
||||
<ul>
|
||||
<li>An index, i, into the list of tokens;</li>
|
||||
<li>The dependencies added so far, in Parse</li>
|
||||
<li>A stack, containing words that occurred before i, for which we’re yet to assign a head.</li>
|
||||
</ul>
|
||||
<p>Each step of the parsing process applies one of three actions to the state:</p>
|
||||
<pre class="language-python"><code>SHIFT = 0; RIGHT = 1; LEFT = 2
|
||||
MOVES = [SHIFT, RIGHT, LEFT]
|
||||
|
||||
def transition(move, i, stack, parse):
|
||||
global SHIFT, RIGHT, LEFT
|
||||
if move == SHIFT:
|
||||
stack.append(i)
|
||||
return i + 1
|
||||
elif move == RIGHT:
|
||||
parse.add_arc(stack[-2], stack.pop())
|
||||
return i
|
||||
elif move == LEFT:
|
||||
parse.add_arc(i, stack.pop())
|
||||
return i
|
||||
raise GrammarError("Unknown move: %d" % move)</code></pre>
|
||||
<p>The <code class="language-python">LEFT</code> and <code class="language-python">RIGHT</code> actions add dependencies and pop the stack, while <code class="language-python">SHIFT</code> pushes the stack and advances i into the buffer.</p>
|
||||
<p>So, the parser starts with an empty stack, and a buffer index at 0, with no dependencies recorded. It chooses one of the (valid) actions, and applies it to the state. It continues choosing actions and applying them until the stack is empty and the buffer index is at the end of the input. (It’s hard to understand this sort of algorithm without stepping through it. Try coming up with a sentence, drawing a projective parse tree over it, and then try to reach the parse tree by choosing the right sequence of transitions.)</p>
|
||||
<p>Here’s what the parsing loop looks like in code:</p>
|
||||
<pre class="language-python"><code>class Parser(object):
|
||||
...
|
||||
def parse(self, words):
|
||||
tags = self.tagger(words)
|
||||
n = len(words)
|
||||
idx = 1
|
||||
stack = [0]
|
||||
deps = Parse(n)
|
||||
while stack or idx < n:
|
||||
features = extract_features(words, tags, idx, n, stack, deps)
|
||||
scores = self.model.score(features)
|
||||
valid_moves = get_valid_moves(idx, n, len(stack))
|
||||
next_move = max(valid_moves, key=lambda move: scores[move])
|
||||
idx = transition(next_move, idx, stack, deps)
|
||||
return tags, deps.heads
|
||||
|
||||
def get_valid_moves(i, n, stack_depth):
|
||||
moves = []
|
||||
if i < n:
|
||||
moves.append(SHIFT)
|
||||
if stack_depth >= 2:
|
||||
moves.append(RIGHT)
|
||||
if stack_depth >= 1:
|
||||
moves.append(LEFT)
|
||||
return moves</code></pre>
|
||||
<p>We start by tagging the sentence, and initializing the state. We then map the state to a set of features, which we score using a linear model. We then find the best-scoring valid move, and apply it to the state.</p>
|
||||
<p>The model scoring works the same as it did in <a href="#">the POS tagger</a>. If you’re confused about the idea of extracting features and scoring them with a linear model, you should review that post. Here’s a reminder of how the model scoring works:</p>
|
||||
<pre class="language-python"><code>class Perceptron(object)
|
||||
...
|
||||
def score(self, features):
|
||||
all_weights = self.weights
|
||||
scores = dict((clas, 0) for clas in self.classes)
|
||||
for feat, value in features.items():
|
||||
if value == 0:
|
||||
continue
|
||||
if feat not in all_weights:
|
||||
continue
|
||||
weights = all_weights[feat]
|
||||
for clas, weight in weights.items():
|
||||
scores[clas] += value * weight
|
||||
return scores</code></pre>
|
||||
<p>It’s just summing the class-weights for each feature. This is often expressed as a dot-product, but when you’re dealing with multiple classes, that gets awkward, I find.</p>
|
||||
<p>The beam parser (RedShift) tracks multiple candidates, and only decides on the best one at the very end. We’re going to trade away accuracy in favour of efficiency and simplicity. We’ll only follow a single analysis. Our search strategy will be entirely greedy, as it was with the POS tagger. We’ll lock-in our choices at every step.</p>
|
||||
<p>If you read the POS tagger post carefully, you might see the underlying similarity. What we’ve done is mapped the parsing problem onto a sequence-labelling problem, which we address using a “flat”, or unstructured, learning algorithm (by doing greedy search).</p>
|
||||
<h3>Features</h3>
|
||||
<p>Feature extraction code is always pretty ugly. The features for the parser refer to a few tokens from the context:</p>
|
||||
<ul>
|
||||
<li>The first three words of the buffer (n0, n1, n2)</li>
|
||||
<li>The top three words of the stack (s0, s1, s2)</li>
|
||||
<li>The two leftmost children of s0 (s0b1, s0b2);</li>
|
||||
<li>The two rightmost children of s0 (s0f1, s0f2);</li>
|
||||
<li>The two leftmost children of n0 (n0b1, n0b2)</li>
|
||||
</ul>
|
||||
<p>For these 12 tokens, we refer to the word-form, the part-of-speech tag, and the number of left and right children attached to the token.</p>
|
||||
<p>Because we’re using a linear model, we have our features refer to pairs and triples of these atomic properties.</p>
|
||||
<pre class="language-python"><code>def extract_features(words, tags, n0, n, stack, parse):
|
||||
def get_stack_context(depth, stack, data):
|
||||
if depth >= 3:
|
||||
return data[stack[-1]], data[stack[-2]], data[stack[-3]]
|
||||
elif depth >= 2:
|
||||
return data[stack[-1]], data[stack[-2]], ''
|
||||
elif depth == 1:
|
||||
return data[stack[-1]], '', ''
|
||||
else:
|
||||
return '', '', ''
|
||||
|
||||
def get_buffer_context(i, n, data):
|
||||
if i + 1 >= n:
|
||||
return data[i], '', ''
|
||||
elif i + 2 >= n:
|
||||
return data[i], data[i + 1], ''
|
||||
else:
|
||||
return data[i], data[i + 1], data[i + 2]
|
||||
|
||||
def get_parse_context(word, deps, data):
|
||||
if word == -1:
|
||||
return 0, '', ''
|
||||
deps = deps[word]
|
||||
valency = len(deps)
|
||||
if not valency:
|
||||
return 0, '', ''
|
||||
elif valency == 1:
|
||||
return 1, data[deps[-1]], ''
|
||||
else:
|
||||
return valency, data[deps[-1]], data[deps[-2]]
|
||||
|
||||
features = {}
|
||||
# Set up the context pieces --- the word, W, and tag, T, of:
|
||||
# S0-2: Top three words on the stack
|
||||
# N0-2: First three words of the buffer
|
||||
# n0b1, n0b2: Two leftmost children of the first word of the buffer
|
||||
# s0b1, s0b2: Two leftmost children of the top word of the stack
|
||||
# s0f1, s0f2: Two rightmost children of the top word of the stack
|
||||
|
||||
depth = len(stack)
|
||||
s0 = stack[-1] if depth else -1
|
||||
|
||||
Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
|
||||
Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
|
||||
|
||||
Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
|
||||
Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
|
||||
|
||||
Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
|
||||
Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
|
||||
|
||||
Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
|
||||
_, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
|
||||
|
||||
Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
|
||||
_, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
|
||||
|
||||
Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
|
||||
_, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
|
||||
|
||||
# Cap numeric features at 5?
|
||||
# String-distance
|
||||
Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
|
||||
|
||||
features['bias'] = 1
|
||||
# Add word and tag unigrams
|
||||
for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
|
||||
if w:
|
||||
features['w=%s' % w] = 1
|
||||
for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
|
||||
if t:
|
||||
features['t=%s' % t] = 1
|
||||
|
||||
# Add word/tag pairs
|
||||
for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
|
||||
if w or t:
|
||||
features['%d w=%s, t=%s' % (i, w, t)] = 1
|
||||
|
||||
# Add some bigrams
|
||||
features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
|
||||
features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
|
||||
features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
|
||||
features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
|
||||
features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
|
||||
features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
|
||||
features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
|
||||
features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
|
||||
|
||||
# Add some tag trigrams
|
||||
trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
|
||||
(Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
|
||||
(Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
|
||||
(Ts0, Ts1, Ts1))
|
||||
for i, (t1, t2, t3) in enumerate(trigrams):
|
||||
if t1 or t2 or t3:
|
||||
features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
|
||||
|
||||
# Add some valency and distance features
|
||||
vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
|
||||
vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
|
||||
d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
|
||||
('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
|
||||
for i, (w_t, v_d) in enumerate(vw + vt + d):
|
||||
if w_t or v_d:
|
||||
features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
|
||||
return features</code></pre>
|
||||
<h3>Training</h3>
|
||||
<p>Weights are learned using the same algorithm, averaged perceptron, that we used for part-of-speech tagging. Its key strength is that it’s an online learning algorithm: examples stream in one-by-one, we make our prediction, check the actual answer, and adjust our beliefs (weights) if we were wrong.</p>
|
||||
<p>The training loop looks like this:</p>
|
||||
<pre class="language-python"><code>class Parser(object):
|
||||
...
|
||||
def train_one(self, itn, words, gold_tags, gold_heads):
|
||||
n = len(words)
|
||||
i = 2; stack = [1]; parse = Parse(n)
|
||||
tags = self.tagger.tag(words)
|
||||
while stack or (i + 1) < n:
|
||||
features = extract_features(words, tags, i, n, stack, parse)
|
||||
scores = self.model.score(features)
|
||||
valid_moves = get_valid_moves(i, n, len(stack))
|
||||
guess = max(valid_moves, key=lambda move: scores[move])
|
||||
gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
|
||||
best = max(gold_moves, key=lambda move: scores[move])
|
||||
self.model.update(best, guess, features)
|
||||
i = transition(guess, i, stack, parse)
|
||||
# Return number correct
|
||||
return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])</code></pre>
|
||||
<p>The most interesting part of the training process is in <code class="language-python">get_gold_moves</code>. The performance of our parser is made possible by an advance by Goldberg and Nivre (2012), who showed that we’d been doing this wrong for years.</p>
|
||||
<p class="box infobox"><strong class="note">2015-08-19 Update:</strong> Interestingly, CoreNLP continues to "do it wrong" – their transition-based parser uses the static-oracle, rather than the dynamic oracle described here. I attribute spaCy's accuracy advantage to this difference in training algorithm. The ClearNLP parser uses an iterative algorithm that achieves the same sort of thing (and was published prior to the dynamic oracle). I find the dynamic oracle idea much more conceptually clear.</p>
|
||||
<p>In the POS-tagging post, I cautioned that during training you need to make sure you pass in the last two <em>predicted</em> tags as features for the current tag, not the last two <em>gold</em> tags. At test time you’ll only have the predicted tags, so if you base your features on the gold sequence during training, your training contexts won’t resemble your test-time contexts, so you’ll learn the wrong weights.</p>
|
||||
<p>In parsing, the problem was that we didn’t know <em>how</em> to pass in the predicted sequence! Training worked by taking the gold-standard tree, and finding a transition sequence that led to it. i.e., you got back a sequence of moves, with the guarantee that if you followed those moves, you’d get the gold-standard dependencies.</p>
|
||||
<p>The problem is, we didn’t know how to define the “correct” move to teach a parser to make if it was in any state that <em>wasn’t</em> along that gold-standard sequence. Once the parser had made a mistake, we didn’t know how to train from that example.</p>
|
||||
<p>That was a big problem, because it meant that once the parser started making mistakes, it would end up in states unlike any in its training data – leading to yet more mistakes. The problem was specific to greedy parsers: once you use a beam, there’s a natural way to do structured prediction.</p>
|
||||
<p class="box infobox"><strong class="note">2015-08-19 Update:</strong> It's since been pointed out to me that what we're calling a "dynamic oracle" here is really a form of <a href="http://www.ausy.tu-darmstadt.de/Research/ICML2011">imitation learning</a>.</p>
|
||||
<p>The solution seems obvious once you know it, like all the best breakthroughs. What we do is define a function that asks “How many gold-standard dependencies can be recovered from this state?”. If you can define that function, then you can apply each move in turn, and ask, “How many gold-standard dependencies can be recovered from <em>this</em> state?”. If the action you applied allows <em>fewer</em> gold-standard dependencies to be reached, then it is sub-optimal.</p>
|
||||
<p>That’s a lot to take in.</p>
|
||||
<p>So we have this function <code>Oracle(state)</code>:
|
||||
<pre><code>Oracle(state) = | gold_arcs ∩ reachable_arcs(state) |</code></pre>
|
||||
</p>
|
||||
<p>We also have a set of actions, each of which returns a new state. We want to know:</p>
|
||||
<ul>
|
||||
<li><code>shift_cost = Oracle(state) – Oracle(shift(state))</code></li>
|
||||
<li><code>right_cost = Oracle(state) – Oracle(right(state))</code></li>
|
||||
<li><code>left_cost = Oracle(state) – Oracle(left(state))</code></li>
|
||||
</ul>
|
||||
<p>Now, at least one of those costs <em>has</em> to be zero. Oracle(state) is asking, “what’s the cost of the best path forward?”, and the first action of that best path has to be shift, right, or left.</p>
|
||||
<p>It turns out that we can derive Oracle fairly simply for many transition systems. The derivation for the transition system we’re using, Arc Hybrid, is in Goldberg and Nivre (2013).</p>
|
||||
<p>We’re going to implement the oracle as a function that returns the zero-cost moves, rather than implementing a function Oracle(state). This prevents us from doing a bunch of costly copy operations. Hopefully the reasoning in the code isn’t too hard to follow, but you can also consult Goldberg and Nivre’s papers if you’re confused and want to get to the bottom of this.</p>
|
||||
<pre class="language-python"><code>def get_gold_moves(n0, n, stack, heads, gold):
|
||||
def deps_between(target, others, gold):
|
||||
for word in others:
|
||||
if gold[word] == target or gold[target] == word:
|
||||
return True
|
||||
return False
|
||||
|
||||
valid = get_valid_moves(n0, n, len(stack))
|
||||
if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
|
||||
return [SHIFT]
|
||||
if gold[stack[-1]] == n0:
|
||||
return [LEFT]
|
||||
costly = set([m for m in MOVES if m not in valid])
|
||||
# If the word behind s0 is its gold head, Left is incorrect
|
||||
if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
|
||||
costly.add(LEFT)
|
||||
# If there are any dependencies between n0 and the stack,
|
||||
# pushing n0 will lose them.
|
||||
if SHIFT not in costly and deps_between(n0, stack, gold):
|
||||
costly.add(SHIFT)
|
||||
# If there are any dependencies between s0 and the buffer, popping
|
||||
# s0 will lose them.
|
||||
if deps_between(stack[-1], range(n0+1, n-1), gold):
|
||||
costly.add(LEFT)
|
||||
costly.add(RIGHT)
|
||||
return [m for m in MOVES if m not in costly]</code></pre>
|
||||
<p>Doing this “dynamic oracle” training procedure makes a big difference to accuracy — typically 1-2%, with no difference to the way the run-time works. The old “static oracle” greedy training procedure is fully obsolete; there’s no reason to do it that way any more.</p>
|
||||
<h3>Conclusion</h3>
|
||||
<p>I have the sense that language technologies, particularly those relating to grammar, are particularly mysterious. I can imagine having no idea what the program might even do.</p>
|
||||
<p>I think it therefore seems natural to people that the best solutions would be overwhelmingly complicated. A 200,000 line Java package feels appropriate.</p>
|
||||
<p>But, algorithmic code is usually short, when only a single algorithm is implemented. And when you only implement one algorithm, and you know exactly what you want to write before you write a line, you also don’t pay for any unnecessary abstractions, which can have a big performance impact.</p>
|
||||
<h3>Notes</h3>
|
||||
<p><a name="note-1"></a> [1] I wasn’t really sure how to count the lines of code in the Stanford parser. Its jar file ships over 200k, but there are a lot of different models in it. It’s not important, but it's certainly over 4k.</p>
|
||||
<p><a name="note-2"></a> [2] For instance, how would you parse, “John’s school of music calls”? You want to make sure the phrase “John’s school” has a consistent structure in both “John’s school calls” and “John’s school of music calls”. Reasoning about the different “slots” you can put a phrase into is a key way we reason about what syntactic analyses look like. You can think of each phrase as having a different shaped connector, which you need to plug into different slots — which each phrase also has a certain number of, each of a different shape. We’re trying to figure out what connectors are where, so we can figure out how the sentences are put together.</p>
|
||||
<h3>Idle speculation</h3>
|
||||
<p>For a long time, incremental language processing algorithms were primarily of scientific interest. If you want to write a parser to test a theory about how the human sentence processor might work, well, that parser needs to build partial interpretations. There’s a wealth of evidence, including commonsense introspection, that establishes that we don’t buffer input and analyse it once the speaker has finished.</p>
|
||||
<p>But now algorithms with that neat scientific feature are winning! As best as I can tell, the secret to that success is to be:</p>
|
||||
<ul>
|
||||
<li>Incremental. Earlier words constrain the search.</li>
|
||||
<li>Error-driven. Training involves a working hypothesis, which is updated as it makes mistakes.</li>
|
||||
</ul>
|
||||
<p>The links to human sentence processing seem tantalising. I look forward to seeing whether these engineering breakthroughs lead to any psycholinguistic advances.</p>
|
||||
<h3>Bibliography</h3>
|
||||
<p>The NLP literature is almost entirely open access. All of the relevant papers can be found <a href="http://aclweb.org/anthology/" rel="nofollow">here</a>.</p>
|
||||
<p>The parser I’ve described is an implementation of the dynamic-oracle Arc-Hybrid system here:<span class="bib-item">Goldberg, Yoav; Nivre, Joakim. <em>Training Deterministic Parsers with Non-Deterministic Oracles</em>. TACL 2013</span></p>
|
||||
<p>However, I wrote my own features for it. The arc-hybrid system was originally described here:<span class="bib-item">Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic programming algorithms for transition-based dependency parsers. ACL 2011</span></p>
|
||||
<p>The dynamic oracle training method was first described here:<span class="bib-item">A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; Nivre, Joakim. COLING 2012</span></p>
|
||||
<p>This work depended on a big break-through in accuracy for transition-based parsers, when beam-search was properly explored by Zhang and Clark. They have several papers, but the preferred citation is:<span class="bib-item">Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized Perceptron and Beam Search. Computational Linguistics 2011 (1)</span></p>
|
||||
<p>Another important paper was this little feature engineering paper, which further improved the accuracy:<span class="bib-item">Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with Rich Non-local Features. ACL 2011</span></p>
|
||||
<p>The generalised perceptron, which is the learning framework for these beam parsers, is from this paper:<span class="bib-item">Collins, Michael. Discriminative Training Methods for Hidden Markov Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002</span></p>
|
||||
<h3>Experimental details</h3>
|
||||
<p>The results at the start of the post refer to Section 22 of the Wall Street Journal corpus. The Stanford parser was run as follows:</p>
|
||||
<pre class="language-bash"><code>java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \
|
||||
-outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $*</code></pre>
|
||||
<p>A small post-process was applied, to undo the fancy tokenisation Stanford adds for numbers, to make them match the PTB tokenisation:</p>
|
||||
<pre class="language-python"><code>"""Stanford parser retokenises numbers. Split them."""
|
||||
import sys
|
||||
import re
|
||||
|
||||
qp_re = re.compile('\xc2\xa0')
|
||||
for line in sys.stdin:
|
||||
line = line.rstrip()
|
||||
if qp_re.search(line):
|
||||
line = line.replace('(CD', '(QP (CD', 1) + ')'
|
||||
line = line.replace('\xc2\xa0', ') (CD ')
|
||||
print line</code></pre>
|
||||
<p>The resulting PTB-format files were then converted into dependencies using the Stanford converter:</p>
|
||||
<pre class="language-bash"><code>./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp
|
||||
./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/
|
||||
./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll</code></pre>
|
||||
<p>I can’t easily read that anymore, but it should just convert every .mrg file in a folder to a CoNLL-format Stanford basic dependencies file, using the settings common in the dependency literature.</p>
|
||||
<p>I then converted the gold-standard trees from WSJ 22, for the evaluation. Accuracy scores refer to unlabelled attachment score (i.e. the head index) of all non-punctuation tokens.</p>
|
||||
<p>To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 into the same conversion script.</p>
|
||||
<p>In a nutshell: The Stanford model and parser.py are trained on the same set of sentences, and they each make their predictions on a held-out test set, for which we know the answers. Accuracy refers to how many of the words’ heads we got correct.</p>
|
||||
<p>Speeds were measured on a 2.4GHz Xeon. I ran the experiments on a server, to give the Stanford parser more memory. The parser.py system runs fine on my MacBook Air. I used PyPy for the parser.py experiments; CPython was about half as fast on an early benchmark.</p>
|
||||
<p>One of the reasons parser.py is so fast is that it does unlabelled parsing. Based on previous experiments, a labelled parser would likely be about 40x slower, and about 1% more accurate. Adapting the program to labelled parsing would be a good exercise for the reader, if you have access to the data.</p>
|
||||
<p>The result from the Redshift parser was produced from commit <code class="language-python">b6b624c9900f3bf</code>, which was run as follows:</p>
|
||||
<pre class="language-bash"><code>./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp
|
||||
./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/
|
||||
./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll</code></pre>
|
||||
<footer role="contentinfo" class="meta"><a href="http://twitter.com/share?text=Parsing English in 500 lines of Python&url=http://spacy.io/blog/parsing-english-in-python&via=spacy_io" title="Share on Twitter" target="_blank" class="button button-twitter">Share on Twitter </a>
|
||||
<div class="discuss"> <a href="https://www.reddit.com/r/programming/comments/245jte/parsing_english_with_500_lines_of_python/" title="Discuss on Reddit" class="button button-reddit">Reddit Thread</a> <a href="https://news.ycombinator.com/item?id=7658864" title="Discuss on Hacker News Thread" class="button button-hn">Hacker News</a>
|
||||
</div>
|
||||
<section class="intro profile">
|
||||
<p><img src="/resources/img/matt.png"> Matthew Honnibal is the author of the spaCy software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin. <span class="social"><a href="//twitter.com/honnibal" target="_blank">Twitter</a></span></p>
|
||||
</section>
|
||||
</footer>
|
||||
</article>
|
||||
</main>
|
||||
<script src="/resources/js/prism.min.js"></script>
|
||||
<!-- Details polyfill-->
|
||||
<script>
|
||||
var details = document.getElementsByTagName("details");
|
||||
var summary = document.getElementsByTagName("summary");
|
||||
for(var i = 0; i < details.length; i++) {
|
||||
(details[i].getAttribute("open") == null) ? details[i].setAttribute("data-open", "false") : details[i].setAttribute("data-open", "true");
|
||||
}
|
||||
for(var i = 0; i < summary.length; i++) {
|
||||
summary[i].addEventListener( "click", function(e) {
|
||||
var parent = this.parentElement;
|
||||
(parent.getAttribute("data-open") == "false") ? parent.setAttribute("data-open", "true") : parent.setAttribute("data-open", "false");
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
|
||||
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
|
||||
ga('create', 'UA-58931649-1', 'auto');
|
||||
ga('send', 'pageview');
|
||||
</script>
|
||||
<footer role="contentinfo"><span class="slogan copyright">© 2015 Syllogism Co. | <a href="mailto:contact@spacy.io">Contact</a></span></footer>
|
||||
</body>
|
||||
</html>
|
621
website/src/jade/blog/parsing-english-in-python/index.jade
Normal file
621
website/src/jade/blog/parsing-english-in-python/index.jade
Normal file
|
@ -0,0 +1,621 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
p.box.infobox #[strong.note 2015-08-19 Update:] I wrote this blog post in 2013, describing an exciting advance in natural language understanding technology. Today, almost all high-performance parsers are using a variant of the algorithm described below (including spaCy). The original post is preserved below, with added commentary in light of recent research.
|
||||
|
||||
p A #[a(href="http://googleresearch.blogspot.de/2013/05/syntactic-ngrams-over-time.html") syntactic parser] describes a sentence’s grammatical structure, to help another application reason about it. Natural languages introduce many unexpected ambiguities, which our world-knowledge immediately filters out. A favourite example:
|
||||
|
||||
p.example They ate the pizza with anchovies
|
||||
|
||||
p #[img(src='/resources/img/anchovies.png', alt='Eat-with pizza-with ambiguity')]
|
||||
|
||||
p A correct parse links “with” to “pizza”, while an incorrect parse links “with” to “eat”:
|
||||
|
||||
.displacy
|
||||
iframe(src='/resources/displacy/anchovies_bad.html', height='275')
|
||||
|
||||
.displacy
|
||||
iframe.displacy(src='/resources/displacy/anchovies_good.html', height='275')
|
||||
p.caption Prepositional phrase attachment is a common source of errors for statistical parsers.
|
||||
|
||||
p The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Parser
|
||||
th Accuracy
|
||||
th Speed (w/s)
|
||||
th Language
|
||||
th LOC
|
||||
tr
|
||||
td Stanford PCFG
|
||||
td 89.6%
|
||||
td 19
|
||||
td Java
|
||||
td > 4,000 #[sup: #[a(href='#note-1') [1]]]
|
||||
|
||||
tr
|
||||
td #[strong parser.py]
|
||||
td 89.8%
|
||||
td 2,020
|
||||
td Python
|
||||
td #[strong ~500]
|
||||
|
||||
tr
|
||||
td Redshift
|
||||
td #[strong 93.6%]
|
||||
td #[strong 2,580]
|
||||
td Cython
|
||||
td ~4,000
|
||||
|
||||
|
||||
table.box.infobox
|
||||
thead
|
||||
tr
|
||||
th Parser
|
||||
th Accuracy
|
||||
th Speed (w/s)
|
||||
th Language
|
||||
th LOC
|
||||
tbody
|
||||
tr
|
||||
td spaCy v0.89
|
||||
td 92.7%
|
||||
td 22,106
|
||||
td Cython
|
||||
td ~10,000
|
||||
tbody
|
||||
tr
|
||||
td Stanford NN
|
||||
td 91.7%
|
||||
td 16,800
|
||||
td Java
|
||||
td > 4,000 #[sup: #[a(href='#note-1') [1]]]
|
||||
|
||||
p.box.infobox #[strong.note Update:] Stanford's CoreNLP now features high-performance transition-based models. It is much faster than the Redshift parser (my research system), but less accurate. spaCy is faster still and more accurate than CoreNLP, but less accurate than Redshift, due to spaCy's use of greedy search. It would be relatively easy to provide a beam-search version of spaCy... But I think the gap in accuracy will continue to close, especially given advances in neural network learning.
|
||||
|
||||
|
||||
p The rest of the post sets up the problem, and then takes you through #[a(href="https://gist.github.com/syllog1sm/10343947") a concise implementation], prepared for this post. The first 200 lines of parser.py, the part-of-speech tagger and learner, are described #[a(href="#") here]. You should probably at least skim that post before reading this one, unless you’re very familiar with NLP research.
|
||||
|
||||
p The Cython system, Redshift, was written for my current research. I plan to improve it for general use in June, after my contract ends at Macquarie University. The current version is #[a(href="http://github.com/syllog1sm/redshift") hosted on GitHub].
|
||||
|
||||
h3 Problem Description
|
||||
|
||||
p It’d be nice to type an instruction like this into your phone:
|
||||
|
||||
p.example Set volume to zero when I’m in a meeting, unless John’s school calls.
|
||||
|
||||
p And have it set the appropriate policy. On Android you can do this sort of thing with #[a(href="https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm") Tasker], but an NL interface would be much better. It’d be especially nice to receive a meaning representation you could edit, so you could see what it thinks you said, and correct it.
|
||||
|
||||
p There are lots of problems to solve to make that work, but some sort of syntactic representation is definitely necessary. We need to know that:
|
||||
|
||||
p.example Unless John’s school calls, when I’m in a meeting, set volume to zero
|
||||
|
||||
p is another way of phrasing the first instruction, while:
|
||||
|
||||
p.example Unless John’s school, call when I’m in a meeting
|
||||
|
||||
p means something completely different.
|
||||
|
||||
p A dependency parser returns a graph of word-word relationships, intended to make such reasoning easier. Our graphs will be trees – edges will be directed, and every node (word) will have exactly one incoming arc (one dependency, with its head), except one.
|
||||
|
||||
h4 Example usage
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| parser = parser.Parser()
|
||||
| tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split()
|
||||
| >>> tags, heads = parser.parse(tokens)
|
||||
| >>> heads
|
||||
| [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11]
|
||||
| >>> for i, h in enumerate(heads):
|
||||
| ... head = tokens[h] if h >= 0 else 'None'
|
||||
| ... print(tokens[i] + ' <-- ' + head)
|
||||
| Set <-- None
|
||||
| the <-- volume
|
||||
| volume <-- Set
|
||||
| to <-- Set
|
||||
| zero <-- to
|
||||
| when <-- Set
|
||||
| I <-- 'm
|
||||
| 'm <-- when
|
||||
| in <-- 'm
|
||||
| a <-- meeting
|
||||
| meeting <-- in
|
||||
| unless <-- Set
|
||||
| John <-- 's
|
||||
| 's <-- calls
|
||||
| school <-- calls
|
||||
| calls <-- unless
|
||||
|
||||
p The idea is that it should be slightly easier to reason from the parse, than it was from the string. The parse-to-meaning mapping is hopefully simpler than the string-to-meaning mapping.
|
||||
|
||||
p The most confusing thing about this problem area is that “correctness” is defined by convention — by annotation guidelines. If you haven’t read the guidelines and you’re not a linguist, you can’t tell whether the parse is “wrong” or “right”, which makes the whole task feel weird and artificial.
|
||||
|
||||
p For instance, there’s a mistake in the parse above: “John’s school calls” is structured wrongly, according to the Stanford annotation guidelines. The structure of that part of the sentence is how the annotators were instructed to parse an example like “John’s school clothes”.
|
||||
|
||||
p It’s worth dwelling on this point a bit. We could, in theory, have written our guidelines so that the “correct” parses were reversed. There’s good reason to believe the parsing task would be harder if we reversed our convention, as it’d be less consistent with the rest of the grammar. #[sup: #[a(href='#note-2') [2]]] But we could test that empirically, and we’d be pleased to gain an advantage by reversing the policy.
|
||||
|
||||
p We definitely do want that distinction in the guidelines — we don’t want both to receive the same structure, or our output will be less useful. The annotation guidelines strike a balance between what distinctions downstream applications will find useful, and what parsers will be able to predict easily.
|
||||
|
||||
h4 Projective trees
|
||||
|
||||
p There’s a particularly useful simplification that we can make, when deciding what we want the graph to look like: we can restrict the graph structures we’ll be dealing with. This doesn’t just give us a likely advantage in learnability; it can have deep algorithmic implications. We follow most work on English in constraining the dependency graphs to be #[em projective trees]:
|
||||
|
||||
ol
|
||||
li Tree. Every word has exactly one head, except for the dummy ROOT symbol.
|
||||
li Projective. For every pair of dependencies (a1, a2) and (b1, b2), if a1 < b1 < a2, then b2 <= a2. In other words, dependencies cannot “cross”. You can’t have a pair of dependencies that goes a1 b1 a2 b2, or b1 a1 b2 a2.
|
||||
|
||||
p There’s a rich literature on parsing non-projective trees, and a smaller literature on parsing DAGs. But the parsing algorithm I’ll be explaining deals with projective trees.
|
||||
|
||||
h3 Greedy transition-based parsing
|
||||
|
||||
p Our parser takes as input a list of string tokens, and outputs a list of head indices, representing edges in the graph. If the #[em i]th member of heads is #[em j], the dependency parse contains an edge (j, i). A transition-based parser is a finite-state transducer; it maps an array of N words onto an output array of N head indices:
|
||||
|
||||
table.center
|
||||
tbody
|
||||
tr
|
||||
td
|
||||
em start
|
||||
td MSNBC
|
||||
td reported
|
||||
td that
|
||||
td Facebook
|
||||
td bought
|
||||
td WhatsApp
|
||||
td for
|
||||
td $16bn
|
||||
td
|
||||
em root
|
||||
tr
|
||||
td 0
|
||||
td 2
|
||||
td 9
|
||||
td 2
|
||||
td 4
|
||||
td 2
|
||||
td 4
|
||||
td 4
|
||||
td 7
|
||||
td 0
|
||||
|
||||
p The heads array denotes that the head of #[em MSNBC] is #[em reported]: #[em MSNBC] is word 1, and #[em reported] is word 2, and #[code.language-python heads[1] == 2]. You can already see why parsing a tree is handy — this data structure wouldn’t work if we had to output a DAG, where words may have multiple heads.
|
||||
|
||||
p Although #[code.language-python heads] can be represented as an array, we’d actually like to maintain some alternate ways to access the parse, to make it easy and efficient to extract features. Our #[code.language-python Parse] class looks like this:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| class Parse(object):
|
||||
| def __init__(self, n):
|
||||
| self.n = n
|
||||
| self.heads = [None] * (n-1)
|
||||
| self.lefts = []
|
||||
| self.rights = []
|
||||
| for i in range(n+1):
|
||||
| self.lefts.append(DefaultList(0))
|
||||
| self.rights.append(DefaultList(0))
|
||||
|
|
||||
| def add_arc(self, head, child):
|
||||
| self.heads[child] = head
|
||||
| if child < head:
|
||||
| self.lefts[head].append(child)
|
||||
| else:
|
||||
| self.rights[head].append(child)
|
||||
|
||||
p As well as the parse, we also have to keep track of where we’re up to in the sentence. We’ll do this with an index into the #[code.language-python words] array, and a stack, to which we’ll push words, before popping them once their head is set. So our state data structure is fundamentally:
|
||||
|
||||
ul
|
||||
li An index, i, into the list of tokens;
|
||||
li The dependencies added so far, in Parse
|
||||
li A stack, containing words that occurred before i, for which we’re yet to assign a head.
|
||||
|
||||
p Each step of the parsing process applies one of three actions to the state:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| SHIFT = 0; RIGHT = 1; LEFT = 2
|
||||
| MOVES = [SHIFT, RIGHT, LEFT]
|
||||
|
|
||||
| def transition(move, i, stack, parse):
|
||||
| global SHIFT, RIGHT, LEFT
|
||||
| if move == SHIFT:
|
||||
| stack.append(i)
|
||||
| return i + 1
|
||||
| elif move == RIGHT:
|
||||
| parse.add_arc(stack[-2], stack.pop())
|
||||
| return i
|
||||
| elif move == LEFT:
|
||||
| parse.add_arc(i, stack.pop())
|
||||
| return i
|
||||
| raise GrammarError("Unknown move: %d" % move)
|
||||
|
||||
p The #[code.language-python LEFT] and #[code.language-python RIGHT] actions add dependencies and pop the stack, while #[code.language-python SHIFT] pushes the stack and advances i into the buffer.
|
||||
|
||||
p So, the parser starts with an empty stack, and a buffer index at 0, with no dependencies recorded. It chooses one of the (valid) actions, and applies it to the state. It continues choosing actions and applying them until the stack is empty and the buffer index is at the end of the input. (It’s hard to understand this sort of algorithm without stepping through it. Try coming up with a sentence, drawing a projective parse tree over it, and then try to reach the parse tree by choosing the right sequence of transitions.)
|
||||
|
||||
p Here’s what the parsing loop looks like in code:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| class Parser(object):
|
||||
| ...
|
||||
| def parse(self, words):
|
||||
| tags = self.tagger(words)
|
||||
| n = len(words)
|
||||
| idx = 1
|
||||
| stack = [0]
|
||||
| deps = Parse(n)
|
||||
| while stack or idx < n:
|
||||
| features = extract_features(words, tags, idx, n, stack, deps)
|
||||
| scores = self.model.score(features)
|
||||
| valid_moves = get_valid_moves(idx, n, len(stack))
|
||||
| next_move = max(valid_moves, key=lambda move: scores[move])
|
||||
| idx = transition(next_move, idx, stack, deps)
|
||||
| return tags, deps.heads
|
||||
|
|
||||
| def get_valid_moves(i, n, stack_depth):
|
||||
| moves = []
|
||||
| if i < n:
|
||||
| moves.append(SHIFT)
|
||||
| if stack_depth >= 2:
|
||||
| moves.append(RIGHT)
|
||||
| if stack_depth >= 1:
|
||||
| moves.append(LEFT)
|
||||
| return moves
|
||||
|
||||
p We start by tagging the sentence, and initializing the state. We then map the state to a set of features, which we score using a linear model. We then find the best-scoring valid move, and apply it to the state.
|
||||
|
||||
p The model scoring works the same as it did in #[a(href="#") the POS tagger]. If you’re confused about the idea of extracting features and scoring them with a linear model, you should review that post. Here’s a reminder of how the model scoring works:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| class Perceptron(object):
|
||||
| ...
|
||||
| def score(self, features):
|
||||
| all_weights = self.weights
|
||||
| scores = dict((clas, 0) for clas in self.classes)
|
||||
| for feat, value in features.items():
|
||||
| if value == 0:
|
||||
| continue
|
||||
| if feat not in all_weights:
|
||||
| continue
|
||||
| weights = all_weights[feat]
|
||||
| for clas, weight in weights.items():
|
||||
| scores[clas] += value * weight
|
||||
| return scores
|
||||
|
||||
p It’s just summing the class-weights for each feature. This is often expressed as a dot-product, but when you’re dealing with multiple classes, that gets awkward, I find.
|
||||
|
||||
p The beam parser (Redshift) tracks multiple candidates, and only decides on the best one at the very end. We’re going to trade away accuracy in favour of efficiency and simplicity. We’ll only follow a single analysis. Our search strategy will be entirely greedy, as it was with the POS tagger. We’ll lock in our choices at every step.
|
||||
|
||||
p If you read the POS tagger post carefully, you might see the underlying similarity. What we’ve done is mapped the parsing problem onto a sequence-labelling problem, which we address using a “flat”, or unstructured, learning algorithm (by doing greedy search).
|
||||
|
||||
h3 Features
|
||||
|
||||
p Feature extraction code is always pretty ugly. The features for the parser refer to a few tokens from the context:
|
||||
|
||||
ul
|
||||
li The first three words of the buffer (n0, n1, n2)
|
||||
li The top three words of the stack (s0, s1, s2)
|
||||
li The two leftmost children of s0 (s0b1, s0b2);
|
||||
li The two rightmost children of s0 (s0f1, s0f2);
|
||||
li The two leftmost children of n0 (n0b1, n0b2)
|
||||
|
||||
p For these 12 tokens, we refer to the word-form, the part-of-speech tag, and the number of left and right children attached to the token.
|
||||
|
||||
p Because we’re using a linear model, we have our features refer to pairs and triples of these atomic properties.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def extract_features(words, tags, n0, n, stack, parse):
|
||||
| def get_stack_context(depth, stack, data):
|
||||
| if depth >= 3:
|
||||
| return data[stack[-1]], data[stack[-2]], data[stack[-3]]
|
||||
| elif depth >= 2:
|
||||
| return data[stack[-1]], data[stack[-2]], ''
|
||||
| elif depth == 1:
|
||||
| return data[stack[-1]], '', ''
|
||||
| else:
|
||||
| return '', '', ''
|
||||
|
|
||||
| def get_buffer_context(i, n, data):
|
||||
| if i + 1 >= n:
|
||||
| return data[i], '', ''
|
||||
| elif i + 2 >= n:
|
||||
| return data[i], data[i + 1], ''
|
||||
| else:
|
||||
| return data[i], data[i + 1], data[i + 2]
|
||||
|
|
||||
| def get_parse_context(word, deps, data):
|
||||
| if word == -1:
|
||||
| return 0, '', ''
|
||||
| deps = deps[word]
|
||||
| valency = len(deps)
|
||||
| if not valency:
|
||||
| return 0, '', ''
|
||||
| elif valency == 1:
|
||||
| return 1, data[deps[-1]], ''
|
||||
| else:
|
||||
| return valency, data[deps[-1]], data[deps[-2]]
|
||||
|
|
||||
| features = {}
|
||||
| # Set up the context pieces --- the word, W, and tag, T, of:
|
||||
| # S0-2: Top three words on the stack
|
||||
| # N0-2: First three words of the buffer
|
||||
| # n0b1, n0b2: Two leftmost children of the first word of the buffer
|
||||
| # s0b1, s0b2: Two leftmost children of the top word of the stack
|
||||
| # s0f1, s0f2: Two rightmost children of the top word of the stack
|
||||
|
|
||||
| depth = len(stack)
|
||||
| s0 = stack[-1] if depth else -1
|
||||
|
|
||||
| Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
|
||||
| Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
|
||||
|
|
||||
| Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
|
||||
| Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
|
||||
|
|
||||
| Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
|
||||
| Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
|
||||
|
|
||||
| Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
|
||||
| _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
|
||||
|
|
||||
| Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
|
||||
| _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
|
||||
|
|
||||
| Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
|
||||
| _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
|
||||
|
|
||||
| # Cap numeric features at 5?
|
||||
| # String-distance
|
||||
| Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
|
||||
|
|
||||
| features['bias'] = 1
|
||||
| # Add word and tag unigrams
|
||||
| for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
|
||||
| if w:
|
||||
| features['w=%s' % w] = 1
|
||||
| for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
|
||||
| if t:
|
||||
| features['t=%s' % t] = 1
|
||||
|
|
||||
| # Add word/tag pairs
|
||||
| for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
|
||||
| if w or t:
|
||||
| features['%d w=%s, t=%s' % (i, w, t)] = 1
|
||||
|
|
||||
| # Add some bigrams
|
||||
| features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
|
||||
| features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
|
||||
| features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
|
||||
| features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
|
||||
| features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
|
||||
| features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
|
||||
| features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
|
||||
| features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
|
||||
|
|
||||
| # Add some tag trigrams
|
||||
| trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
|
||||
| (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
|
||||
| (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
|
||||
| (Ts0, Ts1, Ts1))
|
||||
| for i, (t1, t2, t3) in enumerate(trigrams):
|
||||
| if t1 or t2 or t3:
|
||||
| features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
|
||||
|
|
||||
| # Add some valency and distance features
|
||||
| vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
|
||||
| vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
|
||||
| d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
|
||||
| ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
|
||||
| for i, (w_t, v_d) in enumerate(vw + vt + d):
|
||||
| if w_t or v_d:
|
||||
| features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
|
||||
| return features
|
||||
|
||||
|
||||
h3 Training
|
||||
|
||||
p Weights are learned using the same algorithm, averaged perceptron, that we used for part-of-speech tagging. Its key strength is that it’s an online learning algorithm: examples stream in one-by-one, we make our prediction, check the actual answer, and adjust our beliefs (weights) if we were wrong.
|
||||
|
||||
p The training loop looks like this:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| class Parser(object):
|
||||
| ...
|
||||
| def train_one(self, itn, words, gold_tags, gold_heads):
|
||||
| n = len(words)
|
||||
| i = 2; stack = [1]; parse = Parse(n)
|
||||
| tags = self.tagger.tag(words)
|
||||
| while stack or (i + 1) < n:
|
||||
| features = extract_features(words, tags, i, n, stack, parse)
|
||||
| scores = self.model.score(features)
|
||||
| valid_moves = get_valid_moves(i, n, len(stack))
|
||||
| guess = max(valid_moves, key=lambda move: scores[move])
|
||||
| gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
|
||||
| best = max(gold_moves, key=lambda move: scores[move])
|
||||
| self.model.update(best, guess, features)
|
||||
| i = transition(guess, i, stack, parse)
|
||||
| # Return number correct
|
||||
| return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])
|
||||
|
||||
p The most interesting part of the training process is in #[code.language-python get_gold_moves]. The performance of our parser is made possible by an advance by Goldberg and Nivre (2012), who showed that we’d been doing this wrong for years.
|
||||
|
||||
|
||||
p.box.infobox #[strong.note 2015-08-19 Update:] Interestingly, CoreNLP continues to "do it wrong" – their transition-based parser uses the static-oracle, rather than the dynamic oracle described here. I attribute spaCy's accuracy advantage to this difference in training algorithm. The ClearNLP parser uses an iterative algorithm that achieves the same sort of thing (and was published prior to the dynamic oracle). I find the dynamic oracle idea much more conceptually clear.
|
||||
|
||||
p In the POS-tagging post, I cautioned that during training you need to make sure you pass in the last two #[em predicted] tags as features for the current tag, not the last two #[em gold] tags. At test time you’ll only have the predicted tags, so if you base your features on the gold sequence during training, your training contexts won’t resemble your test-time contexts, so you’ll learn the wrong weights.
|
||||
|
||||
p In parsing, the problem was that we didn’t know #[em how] to pass in the predicted sequence! Training worked by taking the gold-standard tree, and finding a transition sequence that led to it. i.e., you got back a sequence of moves, with the guarantee that if you followed those moves, you’d get the gold-standard dependencies.
|
||||
|
||||
p The problem is, we didn’t know how to define the “correct” move to teach a parser to make if it was in any state that #[em wasn’t] along that gold-standard sequence. Once the parser had made a mistake, we didn’t know how to train from that example.
|
||||
|
||||
p That was a big problem, because it meant that once the parser started making mistakes, it would end up in states unlike any in its training data – leading to yet more mistakes. The problem was specific to greedy parsers: once you use a beam, there’s a natural way to do structured prediction.
|
||||
|
||||
p.box.infobox #[strong.note 2015-08-19 Update:] It's since been pointed out to me that what we're calling a "dynamic oracle" here is really a form of #[a(href="http://www.ausy.tu-darmstadt.de/Research/ICML2011") imitation learning].
|
||||
|
||||
|
||||
p The solution seems obvious once you know it, like all the best breakthroughs. What we do is define a function that asks “How many gold-standard dependencies can be recovered from this state?”. If you can define that function, then you can apply each move in turn, and ask, “How many gold-standard dependencies can be recovered from #[em this] state?”. If the action you applied allows #[em fewer] gold-standard dependencies to be reached, then it is sub-optimal.
|
||||
|
||||
p That’s a lot to take in.
|
||||
|
||||
p So we have this function #[code Oracle(state)]:
|
||||
pre
|
||||
code
|
||||
| Oracle(state) = | gold_arcs ∩ reachable_arcs(state) |
|
||||
|
||||
p We also have a set of actions, each of which returns a new state. We want to know:
|
||||
|
||||
ul
|
||||
li #[code shift_cost = Oracle(state) - Oracle(shift(state))]
|
||||
li #[code right_cost = Oracle(state) - Oracle(right(state))]
|
||||
li #[code left_cost = Oracle(state) - Oracle(left(state))]
|
||||
|
||||
p Now, at least one of those costs #[em has] to be zero. Oracle(state) is asking, “what’s the cost of the best path forward?”, and the first action of that best path has to be shift, right, or left.
|
||||
|
||||
p It turns out that we can derive Oracle fairly simply for many transition systems. The derivation for the transition system we’re using, Arc Hybrid, is in Goldberg and Nivre (2013).
|
||||
|
||||
p We’re going to implement the oracle as a function that returns the zero-cost moves, rather than implementing a function Oracle(state). This prevents us from doing a bunch of costly copy operations. Hopefully the reasoning in the code isn’t too hard to follow, but you can also consult Goldberg and Nivre’s papers if you’re confused and want to get to the bottom of this.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def get_gold_moves(n0, n, stack, heads, gold):
|
||||
| def deps_between(target, others, gold):
|
||||
| for word in others:
|
||||
| if gold[word] == target or gold[target] == word:
|
||||
| return True
|
||||
| return False
|
||||
|
|
||||
| valid = get_valid_moves(n0, n, len(stack))
|
||||
| if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
|
||||
| return [SHIFT]
|
||||
| if gold[stack[-1]] == n0:
|
||||
| return [LEFT]
|
||||
| costly = set([m for m in MOVES if m not in valid])
|
||||
| # If the word behind s0 is its gold head, Left is incorrect
|
||||
| if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
|
||||
| costly.add(LEFT)
|
||||
| # If there are any dependencies between n0 and the stack,
|
||||
| # pushing n0 will lose them.
|
||||
| if SHIFT not in costly and deps_between(n0, stack, gold):
|
||||
| costly.add(SHIFT)
|
||||
| # If there are any dependencies between s0 and the buffer, popping
|
||||
| # s0 will lose them.
|
||||
| if deps_between(stack[-1], range(n0+1, n-1), gold):
|
||||
| costly.add(LEFT)
|
||||
| costly.add(RIGHT)
|
||||
| return [m for m in MOVES if m not in costly]
|
||||
|
||||
p Doing this “dynamic oracle” training procedure makes a big difference to accuracy — typically 1-2%, with no difference to the way the run-time works. The old “static oracle” greedy training procedure is fully obsolete; there’s no reason to do it that way any more.
|
||||
|
||||
h3 Conclusion
|
||||
|
||||
p I have the sense that language technologies, especially those relating to grammar, are particularly mysterious. I can imagine having no idea what the program might even do.
|
||||
|
||||
p I think it therefore seems natural to people that the best solutions would be overwhelmingly complicated. A 200,000 line Java package feels appropriate.
|
||||
|
||||
p But, algorithmic code is usually short, when only a single algorithm is implemented. And when you only implement one algorithm, and you know exactly what you want to write before you write a line, you also don’t pay for any unnecessary abstractions, which can have a big performance impact.
|
||||
|
||||
h3 Notes
|
||||
|
||||
p #[a(name='note-1')] [1] I wasn’t really sure how to count the lines of code in the Stanford parser. Its jar file ships over 200k, but there are a lot of different models in it. It’s not important, but it’s certainly over 4k.
|
||||
|
||||
p #[a(name='note-2')] [2] For instance, how would you parse, “John’s school of music calls”? You want to make sure the phrase “John’s school” has a consistent structure in both “John’s school calls” and “John’s school of music calls”. Reasoning about the different “slots” you can put a phrase into is a key way we reason about what syntactic analyses look like. You can think of each phrase as having a different shaped connector, which you need to plug into different slots — which each phrase also has a certain number of, each of a different shape. We’re trying to figure out what connectors are where, so we can figure out how the sentences are put together.
|
||||
|
||||
h3 Idle speculation
|
||||
p For a long time, incremental language processing algorithms were primarily of scientific interest. If you want to write a parser to test a theory about how the human sentence processor might work, well, that parser needs to build partial interpretations. There’s a wealth of evidence, including commonsense introspection, that establishes that we don’t buffer input and analyse it once the speaker has finished.
|
||||
|
||||
p But now algorithms with that neat scientific feature are winning! As best as I can tell, the secret to that success is to be:
|
||||
|
||||
ul
|
||||
li Incremental. Earlier words constrain the search.
|
||||
li Error-driven. Training involves a working hypothesis, which is updated as it makes mistakes.
|
||||
|
||||
p The links to human sentence processing seem tantalising. I look forward to seeing whether these engineering breakthroughs lead to any psycholinguistic advances.
|
||||
|
||||
h3 Bibliography
|
||||
|
||||
p The NLP literature is almost entirely open access. All of the relevant papers can be found #[a(href="http://aclweb.org/anthology/", rel="nofollow") here].
|
||||
p The parser I’ve described is an implementation of the dynamic-oracle Arc-Hybrid system here:
|
||||
|
||||
span.bib-item Goldberg, Yoav; Nivre, Joakim. #[em Training Deterministic Parsers with Non-Deterministic Oracles]. TACL 2013
|
||||
p However, I wrote my own features for it. The arc-hybrid system was originally described here:
|
||||
|
||||
span.bib-item Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic programming algorithms for transition-based dependency parsers. ACL 2011
|
||||
|
||||
p The dynamic oracle training method was first described here:
|
||||
|
||||
span.bib-item Goldberg, Yoav; Nivre, Joakim. A Dynamic Oracle for Arc-Eager Dependency Parsing. COLING 2012
|
||||
|
||||
p This work depended on a big break-through in accuracy for transition-based parsers, when beam-search was properly explored by Zhang and Clark. They have several papers, but the preferred citation is:
|
||||
|
||||
span.bib-item Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized Perceptron and Beam Search. Computational Linguistics 2011 (1)
|
||||
|
||||
p Another important paper was this little feature engineering paper, which further improved the accuracy:
|
||||
|
||||
span.bib-item Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with Rich Non-local Features. ACL 2011
|
||||
|
||||
p The generalised perceptron, which is the learning framework for these beam parsers, is from this paper:
|
||||
|
||||
span.bib-item Collins, Michael. Discriminative Training Methods for Hidden Markov Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002
|
||||
|
||||
h3 Experimental details
|
||||
|
||||
p The results at the start of the post refer to Section 22 of the Wall Street Journal corpus. The Stanford parser was run as follows:
|
||||
|
||||
pre.language-bash
|
||||
code
|
||||
| java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \
|
||||
| -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $*
|
||||
|
||||
p A small post-process was applied, to undo the fancy tokenisation Stanford adds for numbers, to make them match the PTB tokenisation:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| """Stanford parser retokenises numbers. Split them."""
|
||||
| import sys
|
||||
| import re
|
||||
|
|
||||
| qp_re = re.compile('\xc2\xa0')
|
||||
| for line in sys.stdin:
|
||||
| line = line.rstrip()
|
||||
| if qp_re.search(line):
|
||||
| line = line.replace('(CD', '(QP (CD', 1) + ')'
|
||||
| line = line.replace('\xc2\xa0', ') (CD ')
|
||||
| print line
|
||||
|
||||
p The resulting PTB-format files were then converted into dependencies using the Stanford converter:
|
||||
|
||||
pre.language-bash
|
||||
code
|
||||
| for f in $1/*.mrg; do
|
||||
| echo $f
|
||||
| grep -v CODE $f > "$f.2"
|
||||
| out="$f.dep"
|
||||
| java -mx800m -cp "$scriptdir/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \
|
||||
| -treeFile "$f.2" -basic -makeCopulaHead -conllx > $out
|
||||
| done
|
||||
|
||||
p I can’t easily read that anymore, but it should just convert every .mrg file in a folder to a CoNLL-format Stanford basic dependencies file, using the settings common in the dependency literature.
|
||||
|
||||
p I then converted the gold-standard trees from WSJ 22, for the evaluation. Accuracy scores refer to unlabelled attachment score (i.e. the head index) of all non-punctuation tokens.
|
||||
|
||||
p To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 into the same conversion script.
|
||||
|
||||
p In a nutshell: The Stanford model and parser.py are trained on the same set of sentences, and they each make their predictions on a held-out test set, for which we know the answers. Accuracy refers to how many of the words’ heads we got correct.
|
||||
|
||||
p Speeds were measured on a 2.4GHz Xeon. I ran the experiments on a server, to give the Stanford parser more memory. The parser.py system runs fine on my MacBook Air. I used PyPy for the parser.py experiments; CPython was about half as fast on an early benchmark.
|
||||
|
||||
p One of the reasons parser.py is so fast is that it does unlabelled parsing. Based on previous experiments, a labelled parser would likely be about 40x slower, and about 1% more accurate. Adapting the program to labelled parsing would be a good exercise for the reader, if you have access to the data.
|
||||
|
||||
p The result from the Redshift parser was produced from commit #[code.language-python b6b624c9900f3bf], which was run as follows:
|
||||
|
||||
pre.language-bash
|
||||
code
|
||||
| ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp
|
||||
| ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/
|
||||
| ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll
|
15
website/src/jade/blog/parsing-english-in-python/meta.jade
Normal file
15
website/src/jade/blog/parsing-english-in-python/meta.jade
Normal file
|
@ -0,0 +1,15 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "Parsing English in 500 lines of Python"
|
||||
- Meta.description = "This post explains how transition-based dependency parsers work, and argues that this algorithm represents a break-through in natural language understanding. A concise sample implementation is provided, in 500 lines of Python, with no external dependencies. This post was written in 2013. In 2015 this type of parser is now increasingly dominant."
|
||||
- Meta.date = "2013-12-18"
|
||||
- Meta.url = "/blog/parsing-english-in-python"
|
||||
- Meta.links = [{}, {}]
|
||||
- Meta.links[0].id = 'reddit'
|
||||
- Meta.links[0].name = "Reddit"
|
||||
- Meta.links[0].title = 'Reddit Thread'
|
||||
- Meta.links[0].url = "https://www.reddit.com/r/programming/comments/245jte/parsing_english_with_500_lines_of_python/"
|
||||
- Meta.links[1].id = 'hn'
|
||||
- Meta.links[1].name = "Hacker News"
|
||||
- Meta.links[1].title = 'Hacker News Thread'
|
||||
- Meta.links[1].url = "https://news.ycombinator.com/item?id=7658864"
|
|
@ -0,0 +1,288 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
|
||||
p Up-to-date knowledge about natural language processing is mostly locked away in academia. And academics are mostly pretty self-conscious when we write. We’re careful. We don’t want to stick our necks out too much. But under-confident recommendations suck, so here’s how to write a good part-of-speech tagger.
|
||||
|
||||
p There are a tonne of “best known techniques” for POS tagging, and you should ignore the others and just use Averaged Perceptron.
|
||||
|
||||
p You should use two tags of history, and features derived from the Brown word clusters distributed here.
|
||||
|
||||
p If you only need the tagger to work on carefully edited text, you should use case-sensitive features, but if you want a more robust tagger you should avoid them because they’ll make you over-fit to the conventions of your training domain. Instead, features that ask “how frequently is this word title-cased, in a large sample from the web?” work well. Then you can lower-case your comparatively tiny training corpus.
|
||||
|
||||
p For efficiency, you should figure out which frequent words in your training data have unambiguous tags, so you don’t have to do anything but output their tags when they come up. About 50% of the words can be tagged that way.
|
||||
|
||||
p And unless you really, really can’t do without an extra 0.1% of accuracy, you probably shouldn’t bother with any kind of search strategy — you should just use a greedy model.
|
||||
|
||||
p If you do all that, you’ll find your tagger easy to write and understand, and an efficient Cython implementation will perform as follows on the standard evaluation, 130,000 words of text from the Wall Street Journal:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th Accuracy
|
||||
th Time (130k words)
|
||||
tbody
|
||||
tr
|
||||
td CyGreedyAP
|
||||
td 97.1%
|
||||
td 4s
|
||||
|
||||
p The 4s includes initialisation time — the actual per-token speed is high enough to be irrelevant; it won’t be your bottleneck.
|
||||
|
||||
p It’s tempting to look at 97% accuracy and say something similar, but that’s not true. My parser is about 1% more accurate if the input has hand-labelled POS tags, and the taggers all perform much worse on out-of-domain data. Unfortunately accuracies have been fairly flat for the last ten years. That’s why my recommendation is to just use a simple and fast tagger that’s roughly as good.
|
||||
|
||||
p The thing is though, it’s very common to see people using taggers that aren’t anywhere near that good! For an example of what a non-expert is likely to use, these were the two taggers wrapped by TextBlob, a new Python API that I think is quite neat:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th Accuracy
|
||||
th Time (130k words)
|
||||
tbody
|
||||
tr
|
||||
td NLTK
|
||||
td 94.0%
|
||||
td 3m56s
|
||||
tr
|
||||
td Pattern
|
||||
td 93.5%
|
||||
td 26s
|
||||
|
||||
p Both Pattern and NLTK are very robust and beautifully well documented, so the appeal of using them is obvious. But Pattern’s algorithms are pretty crappy, and NLTK carries tremendous baggage around in its implementation because of its massive framework, and double-duty as a teaching tool.
|
||||
|
||||
p As a stand-alone tagger, my Cython implementation is needlessly complicated – it was written for my parser. So today I wrote a 200 line version of my recommended algorithm for TextBlob. It gets:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th Accuracy
|
||||
th Time (130k words)
|
||||
tbody
|
||||
tr
|
||||
td PyGreedyAP
|
||||
td 96.8%
|
||||
td 12s
|
||||
|
||||
p I traded some accuracy and a lot of efficiency to keep the implementation simple. Here’s a far-too-brief description of how it works.
|
||||
|
||||
h3 Averaged perceptron
|
||||
|
||||
p POS tagging is a “supervised learning problem”. You’re given a table of data, and you’re told that the values in the last column will be missing during run-time. You have to find correlations from the other columns to predict that value.
|
||||
|
||||
p So for us, the missing column will be “part of speech at word i”. The predictor columns (features) will be things like “part of speech at word i-1”, “last three letters of word at i+1”, etc.
|
||||
|
||||
p First, here’s what prediction looks like at run-time:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def predict(self, features):
|
||||
| '''Dot-product the features and current weights and return the best class.'''
|
||||
| scores = defaultdict(float)
|
||||
| for feat in features:
|
||||
| if feat not in self.weights:
|
||||
| continue
|
||||
| weights = self.weights[feat]
|
||||
| for clas, weight in weights.items():
|
||||
| scores[clas] += weight
|
||||
| # Do a secondary alphabetic sort, for stability
|
||||
| return max(self.classes, key=lambda clas: (scores[clas], clas))
|
||||
|
||||
p Earlier I described the learning problem as a table, with one of the columns marked as missing-at-runtime. For NLP, our tables are always exceedingly sparse. You have columns like “word i-1=Parliament”, which is almost always 0. So our “weight vectors” can pretty much never be implemented as vectors. Map-types are good though — here we use dictionaries.
|
||||
|
||||
p The input data, features, is a set with a member for every non-zero “column” in our “table” – every active feature. Usually this is actually a dictionary, to let you set values for the features. But here all my features are binary present-or-absent type deals.
|
||||
|
||||
p The weights data-structure is a dictionary of dictionaries, that ultimately associates feature/class pairs with some weight. You want to structure it this way instead of the reverse because of the way word frequencies are distributed: most words are rare, frequent words are very frequent.
|
||||
|
||||
h3 Learning the weights
|
||||
|
||||
p Okay, so how do we get the values for the weights? We start with an empty weights dictionary, and iteratively do the following:
|
||||
|
||||
ol
|
||||
li Receive a new (features, POS-tag) pair
|
||||
li Guess the value of the POS tag given the current “weights” for the features
|
||||
li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class.
|
||||
|
||||
p It’s one of the simplest learning algorithms. Whenever you make a mistake, increment the weights for the correct class, and penalise the weights that led to your false prediction. In code:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def train(self, nr_iter, examples):
|
||||
| for i in range(nr_iter):
|
||||
| for features, true_tag in examples:
|
||||
| guess = self.predict(features)
|
||||
| if guess != true_tag:
|
||||
| for f in features:
|
||||
| self.weights[f][true_tag] += 1
|
||||
| self.weights[f][guess] -= 1
|
||||
| random.shuffle(examples)
|
||||
|
||||
p If you iterate over the same example this way, the weights for the correct class would have to come out ahead, and you’d get the example right. If you think about what happens with two examples, you should be able to see that it will get them both right unless the features are identical. In general the algorithm will converge so long as the examples are linearly separable, although that doesn’t matter for our purpose.
|
||||
|
||||
h3 Averaging the weights
|
||||
|
||||
p We need to do one more thing to make the perceptron algorithm competitive. The problem with the algorithm so far is that if you train it twice on slightly different sets of examples, you end up with really different models. It doesn’t generalise that smartly. And the problem is really in the later iterations — if you let it run to convergence, it’ll pay lots of attention to the few examples it’s getting wrong, and mutate its whole model around them.
|
||||
|
||||
p So, what we’re going to do is make the weights more “sticky” – give the model less chance to ruin all its hard work in the later rounds. And we’re going to do that by returning the averaged weights, not the final weights.
|
||||
|
||||
p I doubt there are many people who are convinced that’s the most obvious solution to the problem, but whatever. We’re not here to innovate, and this way is time tested on lots of problems. If you have another idea, run the experiments and tell us what you find. Actually I’d love to see more work on this, now that the averaged perceptron has become such a prominent learning algorithm in NLP.
|
||||
|
||||
p Okay. So this averaging. How’s that going to work? Note that we don’t want to just average after each outer-loop iteration. We want the average of all the values — from the inner loop. So if we have 5,000 examples, and we train for 10 iterations, we’ll average across 50,000 values for each weight.
|
||||
|
||||
p Obviously we’re not going to store all those intermediate values. Instead, we’ll track an accumulator for each weight, and divide it by the number of iterations at the end. Again: we want the average weight assigned to a feature/class pair during learning, so the key component we need is the total weight it was assigned. But we also want to be careful about how we compute that accumulator. On almost any instance, we’re going to see a tiny fraction of active feature/class pairs. All the other feature/class weights won’t change. So we shouldn’t have to go back and add the unchanged value to our accumulators anyway, like chumps.
|
||||
|
||||
p Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain another dictionary that tracks how long each weight has gone unchanged. Now when we do change a weight, we can do a fast-forwarded update to the accumulator, for all those iterations where it lay unchanged.
|
||||
|
||||
p Here’s what a weight update looks like now that we have to maintain the totals and the time-stamps:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def update(self, truth, guess, features):
|
||||
| def upd_feat(c, f, v):
|
||||
| nr_iters_at_this_weight = self.i - self._timestamps[f][c]
|
||||
| self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c]
|
||||
| self.weights[f][c] += v
|
||||
| self._timestamps[f][c] = self.i
|
||||
|
||||
| self.i += 1
|
||||
| for f in features:
|
||||
| upd_feat(truth, f, 1.0)
|
||||
| upd_feat(guess, f, -1.0)
|
||||
|
||||
h3 Features and pre-processing
|
||||
|
||||
p The POS tagging literature has tonnes of intricate features sensitive to case, punctuation, etc. They help on the standard test-set, which is from Wall Street Journal articles from the 1980s, but I don’t see how they’ll help us learn models that are useful on other text.
|
||||
|
||||
p To help us learn a more general model, we’ll pre-process the data prior to feature extraction, as follows:
|
||||
|
||||
ul
|
||||
li All words are lower cased;
|
||||
li Digits in the range 1800-2100 are represented as !YEAR;
|
||||
li Other digit strings are represented as !DIGITS
|
||||
li It would be better to have a module recognising dates, phone numbers, emails, hash-tags, etc. but that will have to be pushed back into the tokenization.
|
||||
|
||||
p I played around with the features a little, and this seems to be a reasonable bang-for-buck configuration in terms of getting the development-data accuracy to 97% (where it typically converges anyway), and having a smaller memory foot-print:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def _get_features(self, i, word, context, prev, prev2):
|
||||
| '''Map tokens-in-contexts into a feature representation, implemented as a
|
||||
| set. If the features change, a new model must be trained.'''
|
||||
| def add(name, *args):
|
||||
| features.add('+'.join((name,) + tuple(args)))
|
||||
|
||||
| features = set()
|
||||
| add('bias') # This acts sort of like a prior
|
||||
| add('i suffix', word[-3:])
|
||||
| add('i pref1', word[0])
|
||||
| add('i-1 tag', prev)
|
||||
| add('i-2 tag', prev2)
|
||||
| add('i tag+i-2 tag', prev, prev2)
|
||||
| add('i word', context[i])
|
||||
| add('i-1 tag+i word', prev, context[i])
|
||||
| add('i-1 word', context[i-1])
|
||||
| add('i-1 suffix', context[i-1][-3:])
|
||||
| add('i-2 word', context[i-2])
|
||||
| add('i+1 word', context[i+1])
|
||||
| add('i+1 suffix', context[i+1][-3:])
|
||||
| add('i+2 word', context[i+2])
|
||||
| return features
|
||||
|
||||
p I haven’t added any features from external data, such as case frequency statistics from the Google Web 1T corpus. I might add those later, but for now I figured I’d keep things simple.
|
||||
|
||||
h3 What about search?
|
||||
|
||||
p The model I’ve recommended commits to its predictions on each word, and moves on to the next one. Those predictions are then used as features for the next word. There’s a potential problem here, but it turns out it doesn’t matter much. It’s easy to fix with beam-search, but I say it’s not really worth bothering. And it definitely doesn’t matter enough to adopt a slow and complicated algorithm like Conditional Random Fields.
|
||||
|
||||
p Here’s the problem. The best indicator for the tag at position, say, 3 in a sentence is the word at position 3. But the next-best indicators are the tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want the predictions for the surrounding words in hand before we commit to a prediction for the current word. Here’s an example where search might matter:
|
||||
|
||||
p.example Their management plan reforms worked
|
||||
|
||||
p Depending on just what you’ve learned from your training data, you can imagine making a different decision if you started at the left and moved right, conditioning on your previous decisions, than if you’d started at the right and moved left.
|
||||
|
||||
p If that’s not obvious to you, think about it this way: “worked” is almost surely a verb, so if you tag “reforms” with that in hand, you’ll have a different idea of its tag than if you’d just come from “plan”, which you might have regarded as either a noun or a verb.
|
||||
|
||||
p Search can only help you when you make a mistake. It can prevent that error from throwing off your subsequent decisions, or sometimes your future choices will correct the mistake. And that’s why for POS tagging, search hardly matters! Your model is so good straight-up that your past predictions are almost always true. So you really need the planets to align for search to matter at all.
|
||||
|
||||
p And as we improve our taggers, search will matter less and less. Instead of search, what we should be caring about is multi-tagging. If we let the model be a bit uncertain, we can get over 99% accuracy assigning an average of 1.05 tags per word (Vadas et al, ACL 2006). The averaged perceptron is rubbish at multi-tagging though. That’s its big weakness. You really want a probability distribution for that.
|
||||
|
||||
p One caveat when doing greedy search, though. It’s very important that your training data model the fact that the history will be imperfect at run-time. Otherwise, it will be way over-reliant on the tag-history features. Because the Perceptron is iterative, this is very easy.
|
||||
|
||||
p Here’s the training loop for the tagger:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def train(self, sentences, save_loc=None, nr_iter=5, quiet=False):
|
||||
| '''Train a model from sentences, and save it at save_loc. nr_iter
|
||||
| controls the number of Perceptron training iterations.'''
|
||||
| self._make_tagdict(sentences, quiet=quiet)
|
||||
| self.model.classes = self.classes
|
||||
| prev, prev2 = START
|
||||
| for iter_ in range(nr_iter):
|
||||
| c = 0; n = 0
|
||||
| for words, tags in sentences:
|
||||
| context = START + [self._normalize(w) for w in words] + END
|
||||
| for i, word in enumerate(words):
|
||||
| guess = self.tagdict.get(word)
|
||||
| if not guess:
|
||||
| feats = self._get_features(
|
||||
| i, word, context, prev, prev2)
|
||||
| guess = self.model.predict(feats)
|
||||
| self.model.update(tags[i], guess, feats)
|
||||
| # Set the history features from the guesses, not the
|
||||
| # true tags
|
||||
| prev2 = prev; prev = guess
|
||||
| c += guess == tags[i]; n += 1
|
||||
| random.shuffle(sentences)
|
||||
| if not quiet:
|
||||
| print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n)))
|
||||
| self.model.average_weights()
|
||||
| # Pickle as a binary file
|
||||
| if save_loc is not None:
|
||||
| cPickle.dump((self.model.weights, self.tagdict, self.classes),
|
||||
| open(save_loc, 'wb'), -1)
|
||||
|
||||
p Unlike the previous snippets, this one’s literal – I tended to edit the previous ones to simplify. So if they have bugs, hopefully that’s why!
|
||||
|
||||
p At the time of writing, I’m just finishing up the implementation before I submit a pull request to TextBlob. You can see the rest of the source here:
|
||||
|
||||
ul
|
||||
li #[a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py]
|
||||
li #[a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") perceptron.py]
|
||||
|
||||
h3 A final comparison…
|
||||
|
||||
p Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology. The claim is that we’ve just been meticulously over-fitting our methods to this data. Actually the evidence doesn’t really bear this out. Mostly, if a technique is clearly better on one evaluation, it improves others as well. Still, it’s very reasonable to want to know how these tools perform on other text. So I ran the unchanged models over two other sections from the OntoNotes corpus:
|
||||
|
||||
table
|
||||
thead
|
||||
tr
|
||||
th Tagger
|
||||
th WSJ
|
||||
th ABC
|
||||
th Web
|
||||
tbody
|
||||
tr
|
||||
td Pattern
|
||||
td 93.5
|
||||
td 90.7
|
||||
td 88.1
|
||||
tr
|
||||
td NLTK
|
||||
td 94.0
|
||||
td 91.5
|
||||
td 88.4
|
||||
tr
|
||||
td PyGreedyAP
|
||||
td 96.8
|
||||
td 94.8
|
||||
td 91.8
|
||||
|
||||
p The ABC section is broadcast news, Web is text from the web (blogs etc — I haven’t looked at the data much).
|
||||
|
||||
p As you can see, the order of the systems is stable across the three comparisons, and the advantage of our Averaged Perceptron tagger over the other two is real enough. Actually the Pattern tagger does very poorly on out-of-domain text. It mostly just looks up the words, so it’s very domain dependent. I hadn’t realised it before, but it’s obvious enough now that I think about it.
|
||||
|
||||
p We can improve our score greatly by training on some of the foreign data. The technique described in this paper (Daume III, 2007) is the first thing I try when I have to do that.
|
|
@ -0,0 +1,12 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "A Good Part-of-Speech Tagger in about 200 Lines of Python"
|
||||
- Meta.description = "Up-to-date knowledge about natural language processing is mostly locked away in academia. And academics are mostly pretty self-conscious when we write. We’re careful. We don’t want to stick our necks out too much. But under-confident recommendations suck, so here’s how to write a good part-of-speech tagger."
|
||||
- Meta.lede = "Lorem lede for parser blog post"
|
||||
- Meta.date = "2013-09-18"
|
||||
- Meta.url = "/blog/part-of-speech-POS-tagger-in-python"
|
||||
- Meta.links = [{}]
|
||||
- Meta.links[0].id = 'reddit'
|
||||
- Meta.links[0].name = "Reddit"
|
||||
- Meta.links[0].title = 'Reddit Thread'
|
||||
- Meta.links[0].url = "https://www.reddit.com/r/programming/comments/1mdn75/a_good_partofspeech_tagger_in_200_lines_of_python/"
|
100
website/src/jade/blog/writing-c-in-cython/index.jade
Normal file
100
website/src/jade/blog/writing-c-in-cython/index.jade
Normal file
|
@ -0,0 +1,100 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
|
||||
p For the last two years, I’ve done almost all of my work in #[a(href="https://en.wikipedia.org/wiki/Cython" target="_blank") Cython]. And I don’t mean, I write Python, and then “Cythonize” it, with various type-declarations etc. I just, write Cython. I use “raw” C structs and arrays, and occasionally C++ vectors, with a thin wrapper around malloc/free that I wrote myself. The code is almost always exactly as fast as C/C++, because it really is just C/C++ with some syntactic sugar — but with Python “right there”, should I need/want it.
|
||||
|
||||
p This is basically the inverse of the old promise that languages like Python came with: that you would write your whole application in Python, optimise the “hot spots” with C, and voila! C speed, Python convenience, and money in the bank.
|
||||
|
||||
p This was always much nicer in theory than practice. In practice, your data structures have a huge influence on both the efficiency of your code, and how annoying it is to write. Arrays are a pain and fast; lists are blissfully convenient, and very slow. Python loops and function calls are also quite slow, so the part you have to write in C tends to wriggle its way up the stack, until it’s almost your whole application.
|
||||
|
||||
p Today a post came up on HN, on #[a(href="https://www.crumpington.com/blog/2014/10-19-high-performance-python-extensions-part-1.html" target="_blank") writing C extensions for Python]. The author wrote both a pure Python implementation, and a C implementation, using the Numpy C API. This seemed a good opportunity to demonstrate the difference, so I wrote a Cython implementation for comparison:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| import random
|
||||
| from cymem.cymem cimport Pool
|
||||
|
|
||||
| from libc.math cimport sqrt
|
||||
|
|
||||
| cimport cython
|
||||
|
|
||||
| cdef struct Point:
|
||||
| double x
|
||||
| double y
|
||||
|
|
||||
| cdef class World:
|
||||
| cdef Pool mem
|
||||
| cdef int N
|
||||
| cdef double* m
|
||||
| cdef Point* r
|
||||
| cdef Point* v
|
||||
| cdef Point* F
|
||||
| cdef readonly double dt
|
||||
| def __init__(self, N, threads=1, m_min=1, m_max=30.0, r_max=50.0, v_max=4.0, dt=1e-3):
|
||||
| self.mem = Pool()
|
||||
| self.N = N
|
||||
| self.m = <double*>self.mem.alloc(N, sizeof(double))
|
||||
| self.r = <Point*>self.mem.alloc(N, sizeof(Point))
|
||||
| self.v = <Point*>self.mem.alloc(N, sizeof(Point))
|
||||
| self.F = <Point*>self.mem.alloc(N, sizeof(Point))
|
||||
| for i in range(N):
|
||||
| self.m[i] = random.uniform(m_min, m_max)
|
||||
| self.r[i].x = random.uniform(-r_max, r_max)
|
||||
| self.r[i].y = random.uniform(-r_max, r_max)
|
||||
| self.v[i].x = random.uniform(-v_max, v_max)
|
||||
| self.v[i].y = random.uniform(-v_max, v_max)
|
||||
| self.F[i].x = 0
|
||||
| self.F[i].y = 0
|
||||
| self.dt = dt
|
||||
|
|
||||
|
|
||||
| @cython.cdivision(True)
|
||||
| def compute_F(World w):
|
||||
| """Compute the force on each body in the world, w."""
|
||||
| cdef int i, j
|
||||
| cdef double s3, tmp
|
||||
| cdef Point s
|
||||
| cdef Point F
|
||||
| for i in range(w.N):
|
||||
| # Set all forces to zero.
|
||||
| w.F[i].x = 0
|
||||
| w.F[i].y = 0
|
||||
| for j in range(i+1, w.N):
|
||||
| s.x = w.r[j].x - w.r[i].x
|
||||
| s.y = w.r[j].y - w.r[i].y
|
||||
|
|
||||
| s3 = sqrt(s.x * s.x + s.y * s.y)
|
||||
| s3 *= s3 * s3;
|
||||
|
|
||||
| tmp = w.m[i] * w.m[j] / s3
|
||||
| F.x = tmp * s.x
|
||||
| F.y = tmp * s.y
|
||||
|
|
||||
| w.F[i].x += F.x
|
||||
| w.F[i].y += F.y
|
||||
|
|
||||
| w.F[j].x -= F.x
|
||||
| w.F[j].y -= F.y
|
||||
|
|
||||
|
|
||||
| @cython.cdivision(True)
|
||||
| def evolve(World w, int steps):
|
||||
| """Evolve the world, w, through the given number of steps."""
|
||||
| cdef int _, i
|
||||
| for _ in range(steps):
|
||||
| compute_F(w)
|
||||
| for i in range(w.N):
|
||||
| w.v[i].x += w.F[i].x * w.dt / w.m[i]
|
||||
| w.v[i].y += w.F[i].y * w.dt / w.m[i]
|
||||
| w.r[i].x += w.v[i].x * w.dt
|
||||
| w.r[i].y += w.v[i].y * w.dt
|
||||
|
||||
p The Cython version took about 30 minutes to write, and it runs just as fast as the C code — because, why wouldn’t it? It #[em is] C code, really, with just some syntactic sugar. And you don’t even have to learn or think about a foreign, complicated C API… You just, write C. Or C++ — although that’s a little more awkward. Both the Cython version and the C version are about 70x faster than the pure Python version, which uses Numpy arrays.
|
||||
|
||||
p One difference from C: I wrote a little wrapper around malloc/free, #[a(href="https://github.com/syllog1sm/cymem" target="_blank") cymem]. All it does is remember the addresses it served, and when the Pool is garbage collected, it frees the memory it allocated. I’ve had no trouble with memory leaks since I started using this.
|
||||
|
||||
p The “intermediate” way of writing Cython, using typed memory-views, allows you to use the Numpy multi-dimensional array features. However, to me it feels more complicated, and the applications I tend to write involve very sparse arrays — where, once again, I want to define my own data structures.
|
||||
|
||||
p.box.infobox #[strong.note Note:] I found a Russian translation of this post #[a(href="http://habrahabr.ru/company/mailru/blog/242533/" rel="nofollow" target="_blank") here]. I don’t know how accurate it is.
|
15
website/src/jade/blog/writing-c-in-cython/meta.jade
Normal file
15
website/src/jade/blog/writing-c-in-cython/meta.jade
Normal file
|
@ -0,0 +1,15 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.headline = "Writing C in Cython"
|
||||
- Meta.description = "For the last two years, I’ve done almost all of my work in Cython. And I don’t mean, I write Python, and then “Cythonize” it, with various type-declarations et cetera. I just, write Cython. I use \"raw\" C structs and arrays, and occasionally C++ vectors, with a thin wrapper around malloc/free that I wrote myself. The code is almost always exactly as fast as C/C++, because that's really all it is, but with Python right there, if I want it."
|
||||
- Meta.date = "2014-10-21"
|
||||
- Meta.url = "/blog/writing-c-in-cython"
|
||||
- Meta.links = [{}, {}]
|
||||
- Meta.links[0].id = 'reddit'
|
||||
- Meta.links[0].name = "Reddit"
|
||||
- Meta.links[0].title = 'Reddit Thread'
|
||||
- Meta.links[0].url = "https://www.reddit.com/r/Python/comments/2jvdw9/writing_c_in_cython/"
|
||||
- Meta.links[1].id = 'hn'
|
||||
- Meta.links[1].name = "Hacker News"
|
||||
- Meta.links[1].title = 'Hacker News Thread'
|
||||
- Meta.links[1].url = "https://news.ycombinator.com/item?id=8483872"
|
606
website/src/jade/docs/_api.html
Normal file
606
website/src/jade/docs/_api.html
Normal file
|
@ -0,0 +1,606 @@
|
|||
|
||||
<!-- TODO-->
|
||||
<!-- Doc-->
|
||||
<!-- to_array-->
|
||||
<!-- count_by-->
|
||||
<!-- from_array-->
|
||||
<!-- from_bytes-->
|
||||
<!-- to_bytes-->
|
||||
<!-- read_bytes-->
|
||||
<!-- -->
|
||||
<!-- Token-->
|
||||
<!-- Constructors-->
|
||||
<!-- Examples for repvec. Rename?-->
|
||||
<!-- Link Simple Good Turing in prob-->
|
||||
<!---->
|
||||
<!-- Span-->
|
||||
<!-- Constructors-->
|
||||
<!-- Convert details to Define lists-->
|
||||
<!-- Styling of elements in Parse. Improve Span.root documentation-->
|
||||
<!-- -->
|
||||
<!-- Lexeme-->
|
||||
<!-- Constructors-->
|
||||
<!---->
|
||||
<!-- Vocab-->
|
||||
<!-- Constructors-->
|
||||
<!---->
|
||||
<!-- StringStore-->
|
||||
<!-- Constructors-->
|
||||
<details>
|
||||
<summary><a name="pipeline"><span class="declaration"><span class="label">class</span><code>English</code></span></a></summary>
|
||||
<p>Load models into a callable object to process English text. Intended use is for one instance to be created per process. You can create more if you're doing something unusual. You may wish to make the instance a global variable or "singleton". We usually instantiate the object in the <code>main()</code> function and pass it around as an explicit argument. </p>
|
||||
<pre class="language-python"><code>from spacy.en import English
|
||||
from spacy._doc_examples import download_war_and_peace
|
||||
|
||||
unprocessed_unicode = download_war_and_peace()
|
||||
|
||||
nlp = English()
|
||||
doc = nlp(unprocessed_unicode)</code></pre>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>__init__</code><span class="parameters">self, data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True</span></span></a></summary>
|
||||
<p>Load the resources. Loading takes 20 seconds, and the instance consumes 2 to 3 gigabytes of memory.</p>
|
||||
<p>Load data from default directory:</p>
|
||||
<pre class="language-python"><code>>>> nlp = English()
|
||||
>>> nlp = English(data_dir=u'')</code></pre>
|
||||
<p>Load data from specified directory:</p>
|
||||
<pre class="language-python"><code>>>> nlp = English(data_dir=u'path/to/data_directory')</code></pre>
|
||||
<p>Disable (and avoid loading) parts of the processing pipeline:</p>
|
||||
<pre class="language-python"><code>>>> nlp = English(load_vectors=False, Parser=False, Tagger=False, Entity=False)</code></pre>
|
||||
<p>Start with nothing loaded:</p>
|
||||
<pre class="language-python"><code>>>> nlp = English(data_dir=None)</code></pre>
|
||||
<ul>
|
||||
<li><strong>data_dir</strong> –
|
||||
|
||||
The data directory. May be <code>None</code>, to disable any data loading (including the vocabulary).
|
||||
</li>
|
||||
<li><strong>Tagger</strong> – A class/function that creates the part-of-speech tagger. Usually this is left <code>True</code>, to load the default tagger. If falsey, no tagger is loaded.
|
||||
<p>You can also supply your own class/function, which will be called once on setup. The returned function will then be called in <code>English.__call__</code>. The function passed must accept two arguments, of types <code>(StringStore, directory)</code>, and produce a function that accepts one argument, of type <code>Doc</code>. Its return type is unimportant.</p>
|
||||
</li>
|
||||
<li><strong>Parser</strong> – A class/function that creates the syntactic dependency parser. Usually this is left <code>True</code>, to load the default parser. If falsey, no parser is loaded.
|
||||
<p>You can also supply your own class/function, which will be called once on setup. The returned function will then be called in <code>English.__call__</code>. The function passed must accept two arguments, of types <code>(StringStore, directory)</code>, and produce a function that accepts one argument, of type <code>Doc</code>. Its return type is unimportant.</p>
|
||||
</li>
|
||||
<li><strong>Entity</strong> – A class/function that creates the named entity recognizer. Usually this is left <code>True</code>, to load the default entity recognizer. If falsey, no entity recognizer is loaded.
|
||||
<p>You can also supply your own class/function, which will be called once on setup. The returned function will then be called in <code>English.__call__</code>. The function passed must accept two arguments, of types <code>(StringStore, directory)</code>, and produce a function that accepts one argument, of type <code>Doc</code>. Its return type is unimportant.</p>
|
||||
</li>
|
||||
<li><strong>load_vectors</strong> –
|
||||
|
||||
A boolean value to control whether the word vectors are loaded.
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details open="true">
|
||||
<summary><a name="English-__call__"><span class="declaration"><code>__call__</code><span class="parameters">text, tag=True, parse=True, entity=True</span></span></a></summary>
|
||||
<p>The main entry point to spaCy. Takes raw unicode text, and returns a <code>Doc</code> object, which can be iterated to access <code>Token</code> and <code>Span</code> objects. spaCy's models are all linear-time, so you can supply documents of arbitrary length, e.g. whole novels.</p>
|
||||
<ul>
|
||||
<li><strong>text</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) – The text to be processed. spaCy expects raw unicode text – you don't necessarily need to, say, split it into paragraphs. However, depending on your documents, you might be better off applying custom pre-processing. Non-text formatting, e.g. from HTML mark-up, should be removed before sending the document to spaCy. If your documents have a consistent format, you may be able to improve accuracy by pre-processing. For instance, if the first word of each document is always in upper-case, it may be helpful to normalize it before supplying the document to spaCy.
|
||||
</li>
|
||||
<li><strong>tag</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>) –Whether to apply the part-of-speech tagger. Required for parsing and entity recognition.
|
||||
</li>
|
||||
<li><strong>parse</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>) – Whether to apply the syntactic dependency parser.
|
||||
</li>
|
||||
<li><strong>entity</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#bool"><em>bool</em></a>) –Whether to apply the named entity recognizer.
|
||||
</li>
|
||||
</ul>
|
||||
<pre class="language-python"><code>from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp(u'Some text.') # Applies tagger, parser, entity
|
||||
doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
doc = nlp(u'') # Zero-length tokens, not an error
|
||||
# doc = nlp(b'Some text') <-- Error: need unicode
|
||||
doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.</code></pre>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a name="doc"><span class="declaration"><span class="label">class</span><code>Doc</code></span></a></summary>
|
||||
<p>A sequence of <code>Token</code> objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary strings.</p>
|
||||
<p>Internally, the <code>Doc</code> object holds an array of <code>TokenC</code> structs. The Python-level <code>Token</code> and <code>Span</code> objects are views of this array, i.e. they don't own the data themselves. These details of the internals shouldn't matter for the API – but they may help you read the code, and understand how spaCy is designed.</p>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Constructors</h4>
|
||||
</summary><a href="#English-__call__"><span class="declaration"><span class="label">via</span><code>English.__call__(unicode text)</code></span></a>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>__init__</code><span class="parameters">self, vocab, orth_and_spaces=None</span></span></a></summary> This method of constructing a <code>Doc</code> object is usually only used for deserialization. Standard usage is to construct the document via a call to the language object.
|
||||
<ul>
|
||||
<li><strong>vocab</strong> – A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer).
|
||||
</li>
|
||||
<li><strong>orth_and_spaces</strong> – A list of <code>(orth_id, has_space)</code> tuples, where <code>orth_id</code> is an integer, and has_space is a boolean, indicating whether the token has a trailing space.
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Sequence API</h4>
|
||||
</summary>
|
||||
<li><span class="declaration"><code>doc[i]</code></span> Get the <code>Token</code> object at position <code>i</code>, where <code>i</code> is an integer. Negative indexing is supported, and follows the usual Python semantics, i.e. <code>doc[-2]</code> is <code>doc[len(doc) - 2]</code>.
|
||||
</li>
|
||||
<li><span class="declaration"><code>doc[start : end]</code></span> Get a <code>Span</code> object, starting at position <code>start</code> and ending at position <code>end</code>. For instance, <code>doc[2:5]</code> produces a span consisting of tokens 2, 3 and 4. Stepped slices (e.g. <code>doc[start : end : step]</code>) are not supported, as <code>Span</code> objects must be contiguous (cannot have gaps).
|
||||
</li>
|
||||
<li><span class="declaration"><code>for token in doc</code></span> Iterate over <code>Token</code> objects, from which the annotations can be easily accessed. This is the main way of accessing <code>Token</code> objects, which are the main way annotations are accessed from Python. If faster-than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython, via <code>Doc.data</code>, an array of <code>TokenC</code> structs. The C API has not yet been finalized, and is subject to change.
|
||||
</li>
|
||||
<li><span class="declaration"><code>len(doc)</code></span> The number of tokens in the document.
|
||||
</li>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Sentence, entity and noun chunk spans</h4>
|
||||
</summary>
|
||||
<details>
|
||||
<summary><span class="declaration"><code>sents</code></span></summary>
|
||||
<p> Yields sentence <code>Span</code> objects. Iterate over the span to get individual <code>Token</code> objects. Sentence spans have no label.
|
||||
<pre class="language-python"><code>>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> doc = nlp(u"This is a sentence. Here's another...")
|
||||
>>> for sentence in doc.sents:
|
||||
... sentence.root.orth_
|
||||
is
|
||||
's</code></pre>
|
||||
</p>
|
||||
</details>
|
||||
<details>
|
||||
<summary><span class="declaration"><code>ents</code></span></summary>
|
||||
<p> Yields named-entity <code>Span</code> objects. Iterate over the span to get individual <code>Token</code> objects, or access the label:
|
||||
<pre><code>>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
>>> ents = list(tokens.ents)
|
||||
>>> ents[0].label, ents[0].label_, ents[0].orth_, ents[0].string
|
||||
(112504, 'PERSON', 'Best', 'Best ')</code></pre>
|
||||
</p>
|
||||
</details>
|
||||
<details>
|
||||
<summary><span class="declaration"><code>noun_chunks</code></span></summary>
|
||||
<p> Yields base noun-phrase <code>Span </code> objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. For example:
|
||||
<pre class="language-python"><code>>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> doc = nlp('The sentence in this example has three noun chunks.')
|
||||
>>> for chunk in doc.noun_chunks:
|
||||
... print(chunk.label_, chunk.orth_, '<--', chunk.root.head.orth_)
|
||||
NP The sentence <-- has
|
||||
NP this example <-- in
|
||||
NP three noun chunks <-- has</code></pre>
|
||||
</p>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Export/Import</h4>
|
||||
</summary>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>to_array</code><span class="parameters">attr_ids</span></span></a></summary>Given a list of M attribute IDs, export the tokens to a numpy ndarray of shape N*M, where N is the length of the sentence.
|
||||
<ul>
|
||||
<li><strong>attr_ids</strong> (list[int]) –A list of attribute ID ints. Attribute IDs can be imported from <code>spacy.attrs</code>
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>count_by</code><span class="parameters">attr_id</span></span></a></summary>Produce a dict of <code>{attribute (int): count (int)}</code> frequencies, keyed by the values of the given attribute ID.
|
||||
<pre class="language-python"><code>>>> from spacy.en import English, attrs
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'apple apple orange banana')
|
||||
>>> tokens.count_by(attrs.ORTH)
|
||||
{12800L: 1, 11880L: 2, 7561L: 1}
|
||||
>>> tokens.to_array([attrs.ORTH])
|
||||
array([[11880],
|
||||
[11880],
|
||||
[7561],
|
||||
[12800]])</code></pre>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>from_array</code><span class="parameters">attrs, array</span></span></a></summary>Write to a <code>Doc</code> object, from an M*N array of attributes.
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>from_bytes</code><span class="parameters"></span></span></a></summary>Deserialize, loading from bytes.
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>to_bytes</code><span class="parameters"></span></span></a></summary>Serialize, producing a byte string.
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>read_bytes</code><span class="parameters"></span></span></a></summary>classmethod
|
||||
</details>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a name="token"><span class="declaration"><span class="label">class</span><code>Token</code></span></a></summary>A Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. <code>token.orth</code> is an integer ID, <code>token.orth_</code> is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed.
|
||||
<details>
|
||||
<summary>
|
||||
<h4>String Features</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>lemma / lemma_</code></span>The "base" of the word, with no inflectional suffixes, e.g. the lemma of "developing" is "develop", the lemma of "geese" is "goose", etc. Note that <em>derivational</em> suffixes are not stripped, e.g. the lemma of "institutions" is "institution", not "institute". Lemmatization is performed using the WordNet data, but extended to also cover closed-class words such as pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". We assign pronouns the lemma <code>-PRON-</code>.
|
||||
</li>
|
||||
</ul>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>orth / orth_</code></span>The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
|
||||
</li>
|
||||
<li><span class="declaration"><code>lower / lower_</code></span>The form of the word, but forced to lower-case, i.e. <code class="language-python">lower = word.orth_.lower()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>shape / shape_</code></span>A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
|
||||
</li>
|
||||
<li><span class="declaration"><code>prefix / prefix_</code></span>A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. <code class="language-python">prefix = word.orth_[:1]</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>suffix / suffix_</code></span>A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. <code class="language-python">suffix = word.orth_[-3:]</code>
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Boolean Flags</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>is_alpha</code></span> Equivalent to <code class="language-python">word.orth_.isalpha()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_ascii</code></span> Equivalent to <code class="language-python">not any(ord(c) >= 128 for c in word.orth_)</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_digit</code></span> Equivalent to <code class="language-python">word.orth_.isdigit()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_lower</code></span> Equivalent to <code class="language-python">word.orth_.islower()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_title</code></span> Equivalent to <code class="language-python">word.orth_.istitle()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_punct</code></span> Equivalent to <code class="language-python">word.orth_.ispunct()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_space</code></span> Equivalent to <code class="language-python">word.orth_.isspace()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>like_url</code></span> Does the word resemble a URL?
|
||||
</li>
|
||||
<li><span class="declaration"><code>like_num</code></span> Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
|
||||
</li>
|
||||
<li><span class="declaration"><code>like_email</code></span> Does the word resemble an email?
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_oov</code></span> Is the word out-of-vocabulary?
|
||||
</li>
|
||||
</ul>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>check_flag</code><span class="parameters">flag_id</span></span></a></summary>Get the value of one of the boolean flags
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Distributional Features</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>prob</code></span> The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
|
||||
</li>
|
||||
<li><span class="declaration"><code>cluster</code></span> The Brown cluster ID of the word. These are often useful features for linear models. If you’re using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
|
||||
</li>
|
||||
<li><span class="declaration"><code>repvec</code></span> A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Alignment and Output</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>idx</code></span>Start index of the token in the string
|
||||
</li>
|
||||
<li><span class="declaration"><code>len(token)</code></span>Length of the token's orth string, in unicode code-points.
|
||||
</li>
|
||||
<li><span class="declaration"><code>unicode(token)</code></span>Same as token.orth_
|
||||
</li>
|
||||
<li><span class="declaration"><code>str(token)</code></span>In Python 3, returns <code>token.orth_</code>. In Python 2, returns <code>token.orth_.encode('utf8')</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>string</code></span><code>token.orth_ + token.whitespace_</code>, i.e. the form of the word as it appears in the string,
|
||||
<em>including</em> trailing whitespace. This is useful when you need to use linguistic features to add inline mark-up to the string.
|
||||
</li>
|
||||
<li><span class="declaration"><code>whitespace_</code></span>The trailing whitespace that follows the word in the string, if any (an empty string otherwise).
|
||||
</li>
|
||||
</ul>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Navigating the Parse Tree</h4>
|
||||
</summary>
|
||||
<li><span class="declaration"><code>head</code></span>The immediate syntactic head of the token. If the token is the root of its sentence, it is the token itself, i.e. <code>root_token.head is root_token</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>children</code></span>An iterator that yields from lefts, and then yields from rights.
|
||||
</li>
|
||||
<li><span class="declaration"><code>subtree</code></span>An iterator for the part of the sentence syntactically governed by the word, including the word itself.
|
||||
</li>
|
||||
<li><span class="declaration"><code>left_edge</code></span>The leftmost edge of the token's subtree
|
||||
</li>
|
||||
<li><span class="declaration"><code>right_edge</code></span>The rightmost edge of the token's subtree
|
||||
</li>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>nbor(i=1)</code><span class="parameters"></span></span></a></summary>Get the <em>i</em>th next / previous neighboring token.
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Named Entities</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>ent_type</code></span>If the token is part of an entity, its entity type.
|
||||
</li>
|
||||
<li><span class="declaration"><code>ent_iob</code></span>The IOB (inside, outside, begin) entity recognition tag for the token.
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Constructors</h4>
|
||||
</summary>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>__init__</code><span class="parameters">vocab, doc, offset</span></span></a></summary>
|
||||
<ul>
|
||||
<li><strong>vocab</strong> –A Vocab object
|
||||
</li>
|
||||
<li><strong>doc</strong> –The parent sequence
|
||||
</li>
|
||||
<li><strong>offset</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#int"><em>int</em></a>) –The index of the token within the document
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<!--+attribute("conjuncts")-->
|
||||
<!-- | Conjuncts-->
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a name="span"><span class="declaration"><span class="label">class</span><code>Span</code></span></a></summary>A <code>Span</code> is a slice of a <code>Doc</code> object, consisting of zero or more tokens. Spans are used to represent sentences, named entities, phrases, and arbitrary contiguous slices from the <code>Doc</code> object. <code>Span</code> objects are views – that is, they do not copy the underlying C data. This makes them cheap to construct, as internally they are simply a reference to the <code>Doc</code> object, a start position, an end position, and a label ID.
|
||||
<li><span class="declaration"><code>token = span[i]</code></span>Get the <code>Token</code> object at position <em>i</em>, where <em>i</em> is an offset within the <code>Span</code>, not the document. That is:
|
||||
<pre class="language-python"><code>span = doc[4:6]
|
||||
token = span[0]
|
||||
assert token.i == 4</code></pre>
|
||||
</li>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>for token in span</code></span>Iterate over the <code>Token</code> objects in the span.
|
||||
</li>
|
||||
<li><span class="declaration"><code>__len__</code></span>Number of tokens in the span.
|
||||
</li>
|
||||
<li><span class="declaration"><code>start</code></span>The start offset of the span, i.e. <code class="language-python">span[0].i</code>.
|
||||
</li>
|
||||
<li><span class="declaration"><code>end</code></span>The end offset of the span, i.e. <code class="language-python">span[-1].i + 1</code>
|
||||
</li>
|
||||
</ul>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Navigating the Parse Tree</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>root</code></span>The first ancestor of the first word of the span that has its head outside the span. For example:
|
||||
<pre class="language-python"><code>>>> toks = nlp(u'I like New York in Autumn.')</code></pre>
|
||||
<p>Let's name the indices --- easier than writing <code>toks[4]</code> etc.</p>
|
||||
<pre class="language-python"><code>>>> i, like, new, york, in_, autumn, dot = range(len(toks)) </code></pre>
|
||||
<p>The head of <em>new</em> is <em>York</em>, and the head of <em>York</em> is <em>like</em></p>
|
||||
<pre class="language-python"><code>>>> toks[new].head.orth_
|
||||
'York'
|
||||
>>> toks[york].head.orth_
|
||||
'like'</code></pre>
|
||||
<p>Create a span for "New York". Its root is "York".</p>
|
||||
<pre class="language-python"><code>>>> new_york = toks[new:york+1]
|
||||
>>> new_york.root.orth_
|
||||
'York'</code></pre>
|
||||
<p>When there are multiple words with external dependencies, we take the first:</p>
|
||||
<pre class="language-python"><code>>>> toks[autumn].head.orth_, toks[dot].head.orth_
|
||||
('in', 'like')
|
||||
>>> autumn_dot = toks[autumn:]
|
||||
>>> autumn_dot.root.orth_
|
||||
'Autumn'</code></pre>
|
||||
</li>
|
||||
<li><span class="declaration"><code>lefts</code></span>Tokens that are to the left of the span, whose head is within the span, i.e. <code class="language-python">
|
||||
lefts = [span.doc[i] for i in range(0, span.start)
|
||||
if span.doc[i].head in span]</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>rights</code></span>Tokens that are to the right of the span, whose head is within the span, i.e.
|
||||
<pre class="language-python"><code>rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||
if span.doc[i].head in span]</code></pre>
|
||||
</li>
|
||||
</ul>
|
||||
<li><span class="declaration"><code>subtree</code></span>Tokens in the range <code>(start, end+1)</code>, where <code>start</code> is the index of the leftmost word descended from a token in the span, and <code>end</code> is the index of the rightmost token descended from a token in the span.
|
||||
</li>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Constructors</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>doc[start : end]</code></span>
|
||||
</li>
|
||||
<li><span class="declaration"><code>for entity in doc.ents</code></span>
|
||||
</li>
|
||||
<li><span class="declaration"><code>for sentence in doc.sents</code></span>
|
||||
</li>
|
||||
<li><span class="declaration"><code>for noun_phrase in doc.noun_chunks</code></span>
|
||||
</li>
|
||||
<li><span class="declaration"><code>span = Span(doc, start, end, label=0)</code></span>
|
||||
</li>
|
||||
</ul>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><code>__init__</code><span class="parameters"></span></span></a></summary>Temp <code>span = doc[0:4]</code>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>String Views</h4>
|
||||
</summary>
|
||||
<details open="open">
|
||||
<summary><span class="declaration"><code>string</code></span></summary>
|
||||
<p>String
|
||||
</p>
|
||||
</details>
|
||||
<details open="open">
|
||||
<summary><span class="declaration"><code>lemma / lemma_</code></span></summary>
|
||||
<p>String
|
||||
</p>
|
||||
</details>
|
||||
<details open="open">
|
||||
<summary><span class="declaration"><code>label / label_</code></span></summary>
|
||||
<p>String
|
||||
</p>
|
||||
</details>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a name="lexeme"><span class="declaration"><span class="label">class</span><code>Lexeme</code></span></a></summary>
|
||||
<p>The Lexeme object represents a lexical type, stored in the vocabulary – as opposed to a token, occurring in a document.</p>
|
||||
<p>Lexemes store various features, so that these features can be computed once per type, rather than once per token. As job sizes grow, this can amount to a substantial efficiency improvement.</p>
|
||||
<p>All Lexeme attributes are therefore context independent, as a single lexeme is reused for all usages of that word. Lexemes are keyed by the “orth” attribute. </p>
|
||||
<p>All Lexeme attributes are accessible directly on the Token object.</p>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>String Features</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>orth / orth_</code></span>The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
|
||||
</li>
|
||||
<li><span class="declaration"><code>lower / lower_</code></span>The form of the word, but forced to lower-case, i.e. <code class="language-python">lower = word.orth_.lower()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>shape / shape_</code></span>A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
|
||||
</li>
|
||||
<li><span class="declaration"><code>prefix / prefix_</code></span>A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. <code class="language-python">prefix = word.orth_[:1]</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>suffix / suffix_</code></span>A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. <code class="language-python">suffix = word.orth_[-3:]</code>
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Boolean Features</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>is_alpha</code></span> Equivalent to <code class="language-python">word.orth_.isalpha()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_ascii</code></span> Equivalent to <code class="language-python">not any(ord(c) >= 128 for c in word.orth_)</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_digit</code></span> Equivalent to <code class="language-python">word.orth_.isdigit()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_lower</code></span> Equivalent to <code class="language-python">word.orth_.islower()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_title</code></span> Equivalent to <code class="language-python">word.orth_.istitle()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_punct</code></span> Equivalent to <code class="language-python">word.orth_.ispunct()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_space</code></span> Equivalent to <code class="language-python">word.orth_.isspace()</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>like_url</code></span> Does the word resemble a URL?
|
||||
</li>
|
||||
<li><span class="declaration"><code>like_num</code></span> Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
|
||||
</li>
|
||||
<li><span class="declaration"><code>like_email</code></span> Does the word resemble an email?
|
||||
</li>
|
||||
<li><span class="declaration"><code>is_oov</code></span> Is the word out-of-vocabulary?
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Distributional Features</h4>
|
||||
</summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>prob</code></span> The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
|
||||
</li>
|
||||
<li><span class="declaration"><code>cluster</code></span> The Brown cluster ID of the word. These are often useful features for linear models. If you’re using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
|
||||
</li>
|
||||
<li><span class="declaration"><code>repvec</code></span> A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Constructors</h4>
|
||||
</summary>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>__init__</code><span class="parameters"></span></span></a></summary>
|
||||
<p>Init</p>
|
||||
</details>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><span class="label">class</span><code>Vocab</code></span></a></summary>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>lexeme = vocab[integer_id]</code></span>Get a lexeme by its orth ID
|
||||
</li>
|
||||
<li><span class="declaration"><code>lexeme = vocab[string]</code></span>Get a lexeme by the string corresponding to its orth ID.
|
||||
</li>
|
||||
<li><span class="declaration"><code>for lexeme in vocab</code></span>Iterate over <code>Lexeme</code> objects
|
||||
</li>
|
||||
<li><span class="declaration"><code>vocab[integer_id] = attributes_dict</code></span>A props dictionary
|
||||
</li>
|
||||
<li><span class="declaration"><code>len(vocab)</code></span>Number of lexemes (unique words) in the vocabulary.
|
||||
</li>
|
||||
</ul>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Constructors</h4>
|
||||
</summary>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>__init__</code><span class="parameters"></span></span></a></summary>Tmp
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Save and Load</h4>
|
||||
</summary>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>dump</code><span class="parameters">loc</span></span></a></summary>
|
||||
<ul>
|
||||
<li><strong>loc</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) –Path where the vocabulary should be saved
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>load_lexemes</code><span class="parameters">loc</span></span></a></summary>
|
||||
<ul>
|
||||
<li><strong>loc</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) –Path to load the lexemes.bin file from
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>load_vectors</code><span class="parameters">loc</span></span></a></summary>
|
||||
<ul>
|
||||
<li><strong>loc</strong> (<a class="reference" href="http://docs.python.org/library/functions.html#unicode"><em>unicode</em></a>) –Path to load the vectors.bin from
|
||||
</li>
|
||||
</ul>
|
||||
</details>
|
||||
</details>
|
||||
</details>
|
||||
<details>
|
||||
<summary><a><span class="declaration"><span class="label">class</span><code>StringStore</code></span></a></summary>
|
||||
<p>Intern strings, and map them to sequential integer IDs. The mapping table is very efficient, and a small-string optimization is used to maintain a small memory footprint. Only the integer IDs are held by spaCy's data classes (<code>Doc</code>, <code>Token</code>, <code>Span</code> and <code>Lexeme</code>) – when you use a string-valued attribute like <code>token.orth_</code>, you access a property that computes <code>token.strings[token.orth]</code>.</p>
|
||||
<ul>
|
||||
<li><span class="declaration"><code>string = string_store[int_id]</code></span>Retrieve a string from a given integer ID. If the integer ID is not found, raise <code>IndexError</code>
|
||||
</li>
|
||||
<li><span class="declaration"><code>int_id = string_store[unicode_string]</code></span> Map a unicode string to an integer ID. If the string is previously unseen, it is interned, and a new ID is returned.
|
||||
</li>
|
||||
<li><span class="declaration"><code>int_id = string_store[utf8_byte_string]</code></span> Byte strings are assumed to be in UTF-8 encoding. Strings encoded with other codecs may fail silently. Given a utf8 string, the behaviour is the same as for unicode strings. Internally, strings are stored in UTF-8 format. So if you start with a UTF-8 byte string, it's less efficient to first decode it as unicode, as StringStore will then have to encode it as UTF-8 once again.
|
||||
</li>
|
||||
<li><span class="declaration"><code>n_strings = len(string_store)</code></span>Number of strings in the string-store
|
||||
</li>
|
||||
<li><span class="declaration"><code>for string in string_store</code></span>Iterate over strings in the string store, in order, such that the <em>i</em>th string in the sequence has the ID <em>i</em>:
|
||||
<pre class="language-python"><code>for i, string in enumerate(string_store):
|
||||
assert i == string_store[string]</code></pre>
|
||||
</li>
|
||||
</ul>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Constructors</h4>
|
||||
</summary>
|
||||
<p><code>StringStore.__init__</code> takes no arguments, so a new instance can be constructed as follows:</p>
|
||||
<pre class="language-python"><code>string_store = StringStore()</code></pre>
|
||||
<p>However, in practice you'll usually use the instance owned by the language's <code>vocab</code> object, which all classes hold a reference to:</p>
|
||||
<ul>
|
||||
<li><code class="language-python">english.vocab.strings</code></li>
|
||||
<li><code class="language-python">doc.vocab.strings</code></li>
|
||||
<li><code class="language-python">span.vocab.strings</code></li>
|
||||
<li><code class="language-python">token.vocab.strings</code></li>
|
||||
<li><code class="language-python">lexeme.vocab.strings</code></li>
|
||||
</ul>
|
||||
<p>If you create another instance, it will map strings to different integers – which is usually not what you want.</p>
|
||||
</details>
|
||||
<details>
|
||||
<summary>
|
||||
<h4>Save and Load</h4>
|
||||
</summary>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>dump</code><span class="parameters">loc</span></span></a></summary>
|
||||
<p>Save the strings mapping to the given location, in plain text. The format is subject to change; so if you need to read/write compatible files, you can find details in the <code>strings.pyx</code> source.</p>
|
||||
</details>
|
||||
<details open="open">
|
||||
<summary><a><span class="declaration"><code>load</code><span class="parameters">loc</span></span></a></summary>
|
||||
<p>Load the strings mapping from a plain-text file in the given location. The format is subject to change; so if you need to read/write compatible files, you can find details in the <code>strings.pyx</code> source.</p>
|
||||
</details>
|
||||
</details>
|
||||
</details>
|
701
website/src/jade/docs/_api.jade
Normal file
701
website/src/jade/docs/_api.jade
Normal file
|
@ -0,0 +1,701 @@
|
|||
mixin declare_class(name, ref)
|
||||
details
|
||||
summary
|
||||
a(name=ref)
|
||||
span.declaration
|
||||
span.label class
|
||||
code #{name}
|
||||
block
|
||||
|
||||
mixin method(name, parameters, link_name)
|
||||
details(open=attributes.open)
|
||||
summary
|
||||
a(name=link_name)
|
||||
span.declaration
|
||||
code= name
|
||||
span.parameters
|
||||
| #{parameters}
|
||||
block
|
||||
|
||||
mixin params
|
||||
ul
|
||||
block
|
||||
|
||||
mixin param(name, type, value)
|
||||
li
|
||||
if type
|
||||
<strong>#{name}</strong> (!{type}) –
|
||||
else
|
||||
<strong>#{name}</strong> –
|
||||
block
|
||||
|
||||
mixin attribute(name, type, value)
|
||||
details(open=attributes.open)
|
||||
summary
|
||||
span.declaration
|
||||
code= name
|
||||
p
|
||||
block
|
||||
|
||||
mixin returns(name, type, value)
|
||||
li
|
||||
if type
|
||||
<strong>#{name}</strong> (!{type}) –
|
||||
else
|
||||
<strong>#{name}</strong> –
|
||||
block
|
||||
|
||||
mixin returns(type)
|
||||
| tmp
|
||||
|
||||
mixin init
|
||||
details
|
||||
summary: h4 Constructors
|
||||
|
||||
block
|
||||
|
||||
mixin callable
|
||||
details
|
||||
summary: h4 Callable
|
||||
|
||||
block
|
||||
|
||||
mixin sequence
|
||||
details
|
||||
summary: h4 Sequence API
|
||||
|
||||
block
|
||||
|
||||
mixin maptype
|
||||
details
|
||||
summary: h4 Map
|
||||
|
||||
block
|
||||
|
||||
mixin summary
|
||||
block
|
||||
|
||||
mixin en_example
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| from spacy._doc_examples import download_war_and_peace
|
||||
|
|
||||
| unprocessed_unicode = download_war_and_peace()
|
||||
|
|
||||
| nlp = English()
|
||||
| doc = nlp(unprocessed_unicode)
|
||||
|
||||
mixin SeeAlso(name, link_target)
|
||||
a(href=link_target)
|
||||
span.declaration
|
||||
span.label via
|
||||
code= name
|
||||
|
||||
|
||||
mixin Define(term)
|
||||
li
|
||||
#[span.declaration #[code #{term}]]
|
||||
block
|
||||
|
||||
|
||||
|
||||
mixin LexemeBooleans()
|
||||
ul
|
||||
+Define("is_alpha")
|
||||
| Equivalent to #[code.language-python word.orth_.isalpha()]
|
||||
+Define("is_ascii")
|
||||
| Equivalent to #[code.language-python not any(ord(c) >= 128 for c in word.orth_)]
|
||||
+Define("is_digit")
|
||||
| Equivalent to #[code.language-python word.orth_.isdigit()]
|
||||
+Define("is_lower")
|
||||
| Equivalent to #[code.language-python word.orth_.islower()]
|
||||
+Define("is_title")
|
||||
| Equivalent to #[code.language-python word.orth_.istitle()]
|
||||
+Define("is_punct")
|
||||
| Equivalent to #[code.language-python word.orth_.ispunct()]
|
||||
+Define("is_space")
|
||||
| Equivalent to #[code.language-python word.orth_.isspace()]
|
||||
+Define("like_url")
|
||||
| Does the word resemble a URL?
|
||||
+Define("like_num")
|
||||
| Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
|
||||
+Define("like_email")
|
||||
| Does the word resemble an email?
|
||||
+Define("is_oov")
|
||||
| Is the word out-of-vocabulary?
|
||||
+Define("is_stop")
|
||||
| Is the word part of a "stop list"? Stop lists are used to improve the quality of topic models, by filtering out common, domain-general words.
|
||||
|
||||
mixin LexemeStrings
|
||||
ul
|
||||
+Define("orth / orth_")
|
||||
| The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
|
||||
|
||||
+Define("lower / lower_")
|
||||
| The form of the word, but forced to lower-case, i.e. #[code.language-python lower = word.orth_.lower()]
|
||||
|
||||
+Define("shape / shape_")
|
||||
| A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
|
||||
+Define("prefix / prefix_")
|
||||
| A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. #[code.language-python prefix = word.orth_[:1]]
|
||||
|
||||
+Define("suffix / suffix_")
|
||||
| A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. #[code.language-python suffix = word.orth_[-3:]]
|
||||
|
||||
|
||||
mixin LexemeDistributional
|
||||
ul
|
||||
+Define("prob")
|
||||
| The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
|
||||
+Define("cluster")
|
||||
| The Brown cluster ID of the word. These are often useful features for linear models. If you’re using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
|
||||
+Define("vector")
|
||||
| A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
|
||||
|
||||
|
||||
mixin Func(type1, type2)
|
||||
#{"λ " + type1 + ", " + type2}
|
||||
|
||||
// TODO
|
||||
// Doc
|
||||
// to_array
|
||||
// count_by
|
||||
// from_array
|
||||
// from_bytes
|
||||
// to_bytes
|
||||
// read_bytes
|
||||
//
|
||||
// Examples for repvec. Rename?
|
||||
|
||||
//- attr / parts_of_speech / entity_types / dependency_labels documentation
|
||||
//- Gazetteer?
|
||||
|
||||
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
|
||||
|
||||
-
|
||||
var types = {
|
||||
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
|
||||
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
|
||||
'int': py_docs + 'functions.html#int"><em>int</em></a>',
|
||||
'generator': "",
|
||||
'Vocab': "",
|
||||
'Span': "",
|
||||
'Doc': "",
|
||||
'StringStore': '#',
|
||||
'directory': '#'
|
||||
}
|
||||
|
||||
+declare_class("English", "pipeline")
|
||||
p Load models into a callable object to process English text. Intended use is for one instance to be created per process. You can create more if you're doing something unusual. You may wish to make the instance a global variable or "singleton". We usually instantiate the object in the #[code main()] function and pass it around as an explicit argument.
|
||||
|
||||
+summary
|
||||
+method("__init__", "self, data_dir=None, vocab=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None, serializer=None")(open=true)
|
||||
p Load the linguistic analysis pipeline. Loading may take up to a minute, and the instance consumes 2 to 3 gigabytes of memory. The pipeline class is responsible for loading and saving the components, and applying them in sequence. Each component can be passed as an argument to the #[code __init__] function, or left as #[code None], in which case it will be loaded from a classmethod, named e.g. #[code default_vocab].
|
||||
|
||||
p Common usage is to accept all defaults, in which case loading is simply:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> nlp = spacy.en.English()
|
||||
|
||||
p To keep the default components, but load data from a specified directory, use:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> nlp = English(data_dir=u'path/to/data_directory')
|
||||
|
||||
p To disable (and avoid loading) parts of the processing pipeline:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> nlp = English(parser=False, tagger=False, entity=False)
|
||||
|
||||
+params
|
||||
+param("data_dir")
|
||||
| The data directory. If #[code None], value is obtained via the #[code default_data_dir()] method.
|
||||
|
||||
+param("vocab")
|
||||
| The #[code vocab] object, which should be an instance of class #[code spacy.vocab.Vocab]. If #[code None], the object is obtained from the #[code default_vocab()] class method. The #[code vocab] object manages all of the language specific rules and definitions, maintains the cache of lexical types, and manages the word vectors. Because the #[code vocab] owns this important data, most objects hold a reference to the #[code vocab].
|
||||
|
||||
+param("tokenizer")
|
||||
| The tokenizer, which should be a callable that accepts a unicode string, and returns a #[code Doc] object. If set to #[code None], the default tokenizer is constructed from the #[code default_tokenizer()] method.
|
||||
|
||||
+param("tagger")
|
||||
| The part-of-speech tagger, which should be a callable that accepts a #[code Doc] object, and sets the part-of-speech tags in-place. If set to #[code None], the default tagger is constructed from the #[code default_tagger()] method.
|
||||
|
||||
+param("parser")
|
||||
| The dependency parser, which should be a callable that accepts a #[code Doc] object, and sets the syntactic heads and dependency labels in-place. If set to #[code None], the default parser is constructed from the #[code default_parser()] method.
|
||||
|
||||
+param("entity")
|
||||
| The named entity recognizer, which should be a callable that accepts a #[code Doc] object, and sets the named entity annotations in-place. If set to #[code None], the default entity recognizer is constructed from the #[code default_entity()] method.
|
||||
|
||||
+param("matcher")
|
||||
| The pattern matcher, which should be a callable that accepts a #[code Doc] object, and sets annotations in-place. If set to #[code None], the default matcher is constructed from the #[code default_matcher()] method.
|
||||
|
||||
+method("__call__", "text, tag=True, parse=True, entity=True", "English-__call__")(open="true")
|
||||
p The main entry point to spaCy. Takes raw unicode text, and returns a #[code Doc] object, which can be iterated to access #[code Token] and #[code Span] objects. spaCy's models are all linear-time, so you can supply documents of arbitrary length, e.g. whole novels.
|
||||
|
||||
+params
|
||||
+param("text", types.unicode)
|
||||
| The text to be processed. spaCy expects raw unicode text – you don't necessarily need to, say, split it into paragraphs. However, depending on your documents, you might be better off applying custom pre-processing. Non-text formatting, e.g. from HTML mark-up, should be removed before sending the document to spaCy. If your documents have a consistent format, you may be able to improve accuracy by pre-processing. For instance, if the first word of your documents is always in upper-case, it may be helpful to normalize them before supplying them to spaCy.
|
||||
|
||||
+param("tag", types.bool)
|
||||
| Whether to apply the part-of-speech tagger. Required for parsing and entity recognition.
|
||||
|
||||
+param("parse", types.bool)
|
||||
| Whether to apply the syntactic dependency parser.
|
||||
|
||||
+param("entity", types.bool)
|
||||
| Whether to apply the named entity recognizer.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English()
|
||||
| doc = nlp(u'Some text.') # Applies tagger, parser, entity
|
||||
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
| doc = nlp(u'') # Zero-length tokens, not an error
|
||||
| # doc = nlp(b'Some text') <-- Error: need unicode
|
||||
| doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
||||
|
||||
|
||||
+declare_class("Doc", "doc")
|
||||
p A sequence of #[code Token] objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
||||
|
||||
p Internally, the #[code Doc] object holds an array of #[code TokenC] structs. The Python-level #[code Token] and #[code Span] objects are views of this array, i.e. they don't own the data themselves. These details of the internals shouldn't matter for the API – but they may help you read the code, and understand how spaCy is designed.
|
||||
|
||||
+init
|
||||
+SeeAlso("English.__call__(unicode text)", "#English-__call__")
|
||||
|
||||
+method("__init__", "self, vocab, orth_and_spaces=None")(open=true)
|
||||
| This method of constructing a #[code Doc] object is usually only used for deserialization. Standard usage is to construct the document via a call to the language object.
|
||||
+params
|
||||
+param("vocab", vocab_type)
|
||||
| A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer).
|
||||
+param("orth_and_spaces")
|
||||
| A list of #[code (orth_id, has_space)] tuples, where #[code orth_id] is an integer, and has_space is a boolean, indicating whether the token has a trailing space.
|
||||
|
||||
+sequence
|
||||
ul
|
||||
+Define("doc[i]")
|
||||
| Get the #[code Token] object at position #[code i], where #[code i] is an integer. Negative indexing is supported, and follows the usual Python semantics, i.e. #[code doc[-2]] is <code>doc[len(doc) - 2]</code>.
|
||||
|
||||
+Define("doc[start : end]")
|
||||
| Get a #[code Span] object, starting at position #[code start] and ending at position #[code end]. For instance, <code>doc[2:5]</code> produces a span consisting of tokens 2, 3 and 4. Stepped slices (e.g. <code>doc[start : end : step]</code>) are not supported, as #[code Span] objects must be contiguous (cannot have gaps).
|
||||
|
||||
+Define("for token in doc")
|
||||
| Iterate over #[code Token ] objects, from which the annotations can be easily accessed. This is the main way of accessing #[code Token] objects, which are the main way annotations are accessed from Python. If faster-than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython, via #[code Doc.data], an array of #[code TokenC] structs. The C API has not yet been finalized, and is subject to change.
|
||||
|
||||
+Define("len(doc)")
|
||||
| The number of tokens in the document.
|
||||
|
||||
details
|
||||
summary: h4 Sentence, entity and noun chunk spans
|
||||
|
||||
+attribute("sents", types.generator)(open=true)
|
||||
| Yields sentence #[code Span] objects. Iterate over the span to get individual #[code Token] objects. Sentence spans have no label.
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from spacy.en import English
|
||||
| >>> nlp = English()
|
||||
| >>> doc = nlp(u"This is a sentence. Here's another...")
|
||||
| >>> for sentence in doc.sents:
|
||||
| ... sentence.root.orth_
|
||||
| is
|
||||
| 's
|
||||
|
||||
|
||||
+attribute("ents", types.generator)(open=true)
|
||||
| Yields named-entity #[code Span] objects. Iterate over the span to get individual #[code Token] objects, or access the label:
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from spacy.en import English
|
||||
| >>> nlp = English()
|
||||
| >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
| >>> ents = list(tokens.ents)
|
||||
| >>> ents[0].label, ents[0].label_, ents[0].orth_, ents[0].string
|
||||
| (112504, 'PERSON', 'Best', ents[0].string)
|
||||
|
||||
+attribute("noun_chunks", types.generator)(open=true)
|
||||
| Yields base noun-phrase #[code Span ] objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. For example:
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from spacy.en import English
|
||||
| >>> nlp = English()
|
||||
| >>> doc = nlp('The sentence in this example has three noun chunks.')
|
||||
| >>> for chunk in doc.noun_chunks:
|
||||
| ... print(chunk.label_, chunk.orth_, '<--', chunk.root.head.orth_)
|
||||
| NP The sentence <-- has
|
||||
| NP this example <-- in
|
||||
| NP three noun chunks <-- has
|
||||
|
||||
details
|
||||
summary: h4 Export/Import
|
||||
|
||||
+method("to_array", "attr_ids")(open=true)
|
||||
| Given a list of M attribute IDs, export the tokens to a numpy ndarray of shape N*M, where N is the number of tokens in the document.
|
||||
|
||||
+params
|
||||
+param("attr_ids", "list[int]")(open=true)
|
||||
| A list of attribute ID ints. Attribute IDs can be imported from
|
||||
code spacy.attrs
|
||||
|
||||
+method("count_by", "attr_id")(open=true)
|
||||
| Produce a dict of #[code {attribute (int): count (ints)}] frequencies, keyed by the values of the given attribute ID.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from spacy.en import English, attrs
|
||||
| >>> nlp = English()
|
||||
| >>> tokens = nlp(u'apple apple orange banana')
|
||||
| >>> tokens.count_by(attrs.ORTH)
|
||||
| {12800L: 1, 11880L: 2, 7561L: 1}
|
||||
| >>> tokens.to_array([attrs.ORTH])
|
||||
| array([[11880],
|
||||
| [11880],
|
||||
| [7561],
|
||||
| [12800]])
|
||||
|
||||
+method("from_array", "attrs, array")(open=true)
|
||||
| Write to a #[code Doc] object, from an M*N array of attributes.
|
||||
|
||||
+method("from_bytes", "byte_string")(open=true)
|
||||
| Deserialize, loading from bytes.
|
||||
|
||||
+method("to_bytes")(open=true)
|
||||
| Serialize, producing a byte string.
|
||||
|
||||
+method("read_bytes")(open=true)
|
||||
| A staticmethod, used to read serialized #[code Doc] objects from a file.
|
||||
| For example:
|
||||
pre.language-python
|
||||
code
|
||||
| for byte_string in Doc.read_bytes(open(location_of_bytes)):
|
||||
| doc = Doc(nlp.vocab).from_bytes(byte_string)
|
||||
|
||||
+declare_class("Token", "token")
|
||||
p A Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. #[code token.orth] is an integer ID, #[code token.orth_] is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed.
|
||||
details
|
||||
summary: h4 String Features
|
||||
|
||||
ul
|
||||
+Define("lemma / lemma_")
|
||||
| The "base" of the word, with no inflectional suffixes, e.g. the lemma of "developing" is "develop", the lemma of "geese" is "goose", etc. Note that #[em derivational] suffixes are not stripped, e.g. the lemma of "institutions" is "institution", not "institute". Lemmatization is performed using the WordNet data, but extended to also cover closed-class words such as pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". We assign pronouns the lemma #[code -PRON-].
|
||||
|
||||
+LexemeStrings()
|
||||
|
||||
details
|
||||
summary: h4 Boolean Flags
|
||||
+LexemeBooleans
|
||||
|
||||
+method("check_flag", "flag_id")(open=true)
|
||||
| Get the value of one of the boolean flags
|
||||
|
||||
details
|
||||
summary: h4 Distributional Features
|
||||
+LexemeDistributional()
|
||||
|
||||
details
|
||||
summary: h4 Alignment and Output
|
||||
|
||||
ul
|
||||
+Define("idx")
|
||||
| Start index of the token in the string
|
||||
|
||||
+Define("len(token)", "")
|
||||
| Length of the token's orth string, in unicode code-points.
|
||||
|
||||
+Define("unicode(token)", "")
|
||||
| Same as #[code token.orth_]
|
||||
|
||||
+Define("str(token)", "")
|
||||
| In Python 3, returns #[code token.orth_]. In Python 2, returns
|
||||
| #[code token.orth_.encode('utf8')]
|
||||
|
||||
+Define("text")
|
||||
| An alias for #[code token.orth_].
|
||||
|
||||
+Define("text_with_ws")
|
||||
| #[code token.orth_ + token.whitespace_], i.e. the form of the word as it appears in the string, #[em including trailing whitespace]. This is useful when you need to use linguistic features to add inline mark-up to the string.
|
||||
|
||||
+Define("whitespace_")
|
||||
| The trailing whitespace that follows the token in the string, if any.
|
||||
|
||||
details
|
||||
summary: h4 Navigating the Parse Tree
|
||||
|
||||
ul
|
||||
+Define("head")
|
||||
| The immediate syntactic head of the token. If the token is the root of its sentence, it is the token itself, i.e. #[code root_token.head is root_token]
|
||||
|
||||
+Define("children")
|
||||
| An iterator that yields from lefts, and then yields from rights.
|
||||
|
||||
+Define("subtree")
|
||||
| An iterator for the part of the sentence syntactically governed by the word, including the word itself.
|
||||
|
||||
+Define("left_edge")
|
||||
| The leftmost edge of the token's subtree
|
||||
|
||||
+Define("right_edge")
|
||||
| The rightmost edge of the token's subtree
|
||||
|
||||
+method("nbor(i=1)")(open=true)
|
||||
| Get the #[em i]th next / previous neighboring token.
|
||||
|
||||
details
|
||||
summary: h4 Named Entities
|
||||
|
||||
ul
|
||||
+Define("ent_type")
|
||||
| If the token is part of an entity, its entity type.
|
||||
|
||||
+Define("ent_iob")
|
||||
| The IOB (inside, outside, begin) entity recognition tag for the token.
|
||||
|
||||
+init
|
||||
+method("__init__", "vocab, doc, offset")(open=true)
|
||||
+params
|
||||
+param("vocab", types.Vocab)
|
||||
| A Vocab object
|
||||
|
||||
+param("doc", types.Doc)
|
||||
| The parent sequence
|
||||
|
||||
+param("offset", types.int)
|
||||
| The index of the token within the document
|
||||
|
||||
//+attribute("conjuncts")
|
||||
// | Conjuncts
|
||||
|
||||
+declare_class("Span", "span")
|
||||
| A #[code Span] is a slice of a #[code Doc] object, consisting of zero or more tokens. Spans are used to represent sentences, named entities, phrases, and arbitrary contiguous slices from the #[code Doc] object. #[code Span] objects are views – that is, they do not copy the underlying C data. This makes them cheap to construct, as internally they are simply a reference to the #[code Doc] object, a start position, an end position, and a label ID.
|
||||
|
||||
+Define("token = span[i]")
|
||||
| Get the #[code Token] object at position #[em i], where #[em i] is an offset within the #[code Span], not the document. That is:
|
||||
pre.language-python
|
||||
code
|
||||
| span = doc[4:6]
|
||||
| token = span[0]
|
||||
| assert token.i == 4
|
||||
|
||||
ul
|
||||
+Define("for token in span")
|
||||
| Iterate over the #[code Token] objects in the span.
|
||||
|
||||
+Define("__len__")
|
||||
| Number of tokens in the span.
|
||||
|
||||
+Define("text")
|
||||
| The text content of the span, obtained from #[code.language-python ''.join(token.text_with_ws for token in span)]
|
||||
|
||||
+Define("start")
|
||||
| The start offset of the span, i.e. #[code.language-python span[0].i].
|
||||
|
||||
+Define("end")
|
||||
| The end offset of the span, i.e. #[code.language-python span[-1].i + 1]
|
||||
|
||||
details
|
||||
summary: h4 Navigating the Parse Tree
|
||||
|
||||
+attribute("root")(open=true)
|
||||
| The first ancestor of the first word of the span that has its head outside the span. For example:
|
||||
pre.language-python
|
||||
code
|
||||
| >>> toks = nlp(u'I like New York in Autumn.')
|
||||
|
||||
p Let's name the indices --- easier than writing #[code toks[4]] etc.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||
|
||||
p The head of #[em new] is #[em York], and the head of #[em York] is #[em like]
|
||||
pre.language-python
|
||||
code
|
||||
| >>> toks[new].head.orth_
|
||||
| 'York'
|
||||
| >>> toks[york].head.orth_
|
||||
| 'like'
|
||||
|
||||
p Create a span for "New York". Its root is "York".
|
||||
pre.language-python
|
||||
code
|
||||
| >>> new_york = toks[new:york+1]
|
||||
| >>> new_york.root.orth_
|
||||
| 'York'
|
||||
|
||||
p When there are multiple words with external dependencies, we take the first:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> toks[autumn].head.orth_, toks[dot].head.orth_
|
||||
| ('in', 'like')
|
||||
| >>> autumn_dot = toks[autumn:]
|
||||
| >>> autumn_dot.root.orth_
|
||||
| 'Autumn'
|
||||
|
||||
+attribute("lefts")(open=true)
|
||||
| Tokens that are to the left of the span, whose head is within the span, i.e.
|
||||
code.language-python
|
||||
| lefts = [span.doc[i] for i in range(0, span.start)
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
+attribute("rights")(open=true)
|
||||
| Tokens that are to the right of the span, whose head is within the span, i.e.
|
||||
code.language-python
|
||||
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
|
||||
| if span.doc[i].head in span]
|
||||
|
||||
|
||||
+attribute("subtree")(open=true)
|
||||
| Tokens in the range #[code (start, end+1)], where #[code start] is the index of the leftmost word descended from a token in the span, and #[code end] is the index of the rightmost token descended from a token in the span.
|
||||
|
||||
+init
|
||||
ul
|
||||
+Define("doc[start : end]")
|
||||
+Define("for entity in doc.ents")
|
||||
+Define("for sentence in doc.sents")
|
||||
+Define("for noun_phrase in doc.noun_chunks")
|
||||
+Define("span = Span(doc, start, end, label=0)")
|
||||
|
||||
details
|
||||
summary: h4 Strings
|
||||
|
||||
ul
|
||||
+Define("text_with_ws")
|
||||
| The form of the span as it appears in the string, #[em including trailing whitespace]. This is useful when you need to use linguistic features to add inline mark-up to the string.
|
||||
|
||||
+Define("lemma / lemma_")
|
||||
| Whitespace-concatenated lemmas of each token in the span.
|
||||
|
||||
+Define("label / label_")
|
||||
| The span label, used particularly for named entities.
|
||||
|
||||
+declare_class("Lexeme", "lexeme")
|
||||
p The Lexeme object represents a lexical type, stored in the vocabulary – as opposed to a token, occurring in a document.
|
||||
|
||||
p Each #[code Token] object receives a reference to a lexeme object (specifically, it receives a pointer to a #[code LexemeC] struct). This allows features to be computed and saved once per #[em type], rather than once per #[em token]. As job sizes grow, this amounts to substantial efficiency improvements, as the vocabulary size (number of types) will be much smaller than the total number of words processed (number of tokens).
|
||||
|
||||
p All Lexeme attributes are therefore context independent, as a single lexeme is reused for all usages of that word. Lexemes are keyed by the “orth” attribute.
|
||||
|
||||
p Most Lexeme attributes can be set, with the exception of the primary key, #[code orth]. Assigning to an attribute of the Lexeme object writes to the underlying struct, so all tokens that are backed by that Lexeme will inherit the new value.
|
||||
|
||||
details
|
||||
summary: h4 String Features
|
||||
+LexemeStrings
|
||||
|
||||
details
|
||||
summary: h4 Boolean Features
|
||||
+LexemeBooleans
|
||||
|
||||
details
|
||||
summary: h4 Distributional Features
|
||||
+LexemeDistributional
|
||||
|
||||
+init(open=true)
|
||||
ul
|
||||
+Define("lexeme = vocab[string]")
|
||||
+Define("lexeme = vocab[i]")
|
||||
|
||||
|
||||
|
||||
+declare_class("Vocab")
|
||||
ul
|
||||
+Define("lexeme = vocab[integer_id]")(open="true")
|
||||
| Get a lexeme by its orth ID
|
||||
|
||||
+Define("lexeme = vocab[string]")(open="true")
|
||||
| Get a lexeme by the string corresponding to its orth ID.
|
||||
|
||||
+Define("for lexeme in vocab")
|
||||
| Iterate over #[code Lexeme] objects
|
||||
|
||||
|
||||
+Define("vocab[integer_id] = attributes_dict")(open=true)
|
||||
| A props dictionary
|
||||
|
||||
+Define("len(vocab)")(open=true)
|
||||
| Number of lexemes (unique words) in the vocabulary.
|
||||
|
||||
+init(open=true)
|
||||
ul
|
||||
+Define("nlp.vocab")
|
||||
+Define("doc.vocab")
|
||||
+Define("span.vocab")
|
||||
+Define("token.vocab")
|
||||
+Define("lexeme.vocab")
|
||||
|
||||
details
|
||||
summary: h4 Save and Load
|
||||
|
||||
+method("dump", "loc")(open=true)
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
| Path where the vocabulary should be saved
|
||||
|
||||
+method("load_lexemes", "loc")(open=true)
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
| Path to load the lexemes.bin file from
|
||||
|
||||
+method("load_vectors", "file")(open=true)
|
||||
+params
|
||||
+param("file", types.unicode)
|
||||
| A file-like object, to load word vectors from.
|
||||
|
||||
+method("load_vectors_from_bin_loc", "loc")(open=true)
|
||||
+params
|
||||
+param("loc", types.unicode)
|
||||
| A path to a file, in spaCy's binary word-vectors file format.
|
||||
|
||||
|
||||
+declare_class("StringStore")
|
||||
p Intern strings, and map them to sequential integer IDs. The mapping table is very efficient, and a small-string optimization is used to maintain a small memory footprint. Only the integer IDs are held by spaCy's data classes (#[code Doc], #[code Token], #[code Span] and #[code Lexeme]) – when you use a string-valued attribute like #[code token.orth_], you access a property that computes #[code token.strings[token.orth]].
|
||||
ul
|
||||
+Define("string = string_store[int_id]")
|
||||
| Retrieve a string from a given integer ID. If the integer ID is not found, raise #[code IndexError]
|
||||
|
||||
+Define("int_id = string_store[unicode_string]")
|
||||
| Map a unicode string to an integer ID. If the string is previously unseen, it is interned, and a new ID is returned.
|
||||
|
||||
+Define("int_id = string_store[utf8_byte_string]")
|
||||
| Byte strings are assumed to be in UTF-8 encoding. Strings encoded with other codecs may fail silently. Given a utf8 string, the behaviour is the same as for unicode strings. Internally, strings are stored in UTF-8 format. So if you start with a UTF-8 byte string, it's less efficient to first decode it as unicode, as StringStore will then have to encode it as UTF-8 once again.
|
||||
+Define("n_strings = len(string_store)")
|
||||
| Number of strings in the string-store
|
||||
|
||||
+Define("for string in string_store")(open=true)
|
||||
| Iterate over strings in the string store, in order, such that the #[em i]th string in the sequence has the ID #[em i]:
|
||||
pre.language-python
|
||||
code
|
||||
| for i, string in enumerate(string_store):
|
||||
| assert i == string_store[string]
|
||||
|
||||
+init
|
||||
p #[code StringStore.__init__] takes no arguments, so a new instance can be constructed as follows:
|
||||
pre.language-python
|
||||
code
|
||||
| string_store = StringStore()
|
||||
|
||||
p However, in practice you'll usually use the instance owned by the language's #[code vocab] object, which all classes hold a reference to:
|
||||
|
||||
ul
|
||||
li #[code.language-python english.vocab.strings]
|
||||
li #[code.language-python doc.vocab.strings]
|
||||
li #[code.language-python span.vocab.strings]
|
||||
li #[code.language-python token.vocab.strings]
|
||||
li #[code.language-python lexeme.vocab.strings]
|
||||
|
||||
p If you create another instance, it will map strings to different integers – which is usually not what you want.
|
||||
|
||||
details
|
||||
summary: h4 Save and Load
|
||||
|
||||
+method("dump", "loc")(open=true)
|
||||
p Save the strings mapping to the given location, in plain text. The format is subject to change, so if you need to read/write compatible files, you can find the details in the #[code strings.pyx] source.
|
||||
|
||||
+method("load", "loc")(open=true)
|
||||
p Load the strings mapping from a plain-text file in the given location. The format is subject to change, so if you need to read/write compatible files, you can find the details in the #[code strings.pyx] source.
|
114
website/src/jade/docs/_spec.jade
Normal file
114
website/src/jade/docs/_spec.jade
Normal file
|
@ -0,0 +1,114 @@
|
|||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
||||
mixin Define(term)
|
||||
li
|
||||
#[code #{term}]
|
||||
block
|
||||
|
||||
|
||||
details
|
||||
summary: h4 Overview
|
||||
|
||||
p This document describes the target annotations spaCy is trained to predict. This is currently a work in progress. Please ask questions on the issue tracker, so that the answers can be integrated here to improve the documentation.
|
||||
|
||||
details
|
||||
summary: h4 Tokenization
|
||||
|
||||
p Tokenization standards are based on the OntoNotes 5 corpus.
|
||||
|
||||
p The tokenizer differs from most by including tokens for significant whitespace. Any sequence of whitespace characters beyond a single space (' ') is included as a token. For instance:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from spacy.en import English
|
||||
| nlp = English(parse=False)
|
||||
| tokens = nlp('Some\nspaces and\ttab characters')
|
||||
| print([t.orth_ for t in tokens])
|
||||
|
||||
p Which produces:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters']
|
||||
|
||||
p The whitespace tokens are useful for much the same reason punctuation is – it's often an important delimiter in the text. By preserving it in the token output, we are able to maintain a simple alignment between the tokens and the original string, and we ensure that no information is lost during processing.
|
||||
|
||||
details
|
||||
summary: h4 Sentence boundary detection
|
||||
|
||||
p Sentence boundaries are calculated from the syntactic parse tree, so features such as punctuation and capitalisation play an important but non-decisive role in determining the sentence boundaries. Usually this means that the sentence boundaries will at least coincide with clause boundaries, even given poorly punctuated text.
|
||||
|
||||
details
|
||||
summary: h4 Part-of-speech Tagging
|
||||
|
||||
p The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set. We also map the tags to the simpler Google Universal POS Tag set.
|
||||
|
||||
p Details #[a(href="https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124") here].
|
||||
|
||||
details
|
||||
summary: h4 Lemmatization
|
||||
|
||||
p.
|
||||
A "lemma" is the uninflected form of a word. In English, this means:
|
||||
|
||||
ul
|
||||
li Adjectives: The form like "happy", not "happier" or "happiest"
|
||||
li Adverbs: The form like "badly", not "worse" or "worst"
|
||||
li Nouns: The form like "dog", not "dogs"; like "child", not "children"
|
||||
li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
|
||||
p.
|
||||
The lemmatization data is taken from WordNet. However, we also add a
|
||||
special case for pronouns: all pronouns are lemmatized to the special
|
||||
token #[code -PRON-].
|
||||
|
||||
|
||||
details
|
||||
summary: h4 Syntactic Dependency Parsing
|
||||
|
||||
p The parser is trained on data produced by the ClearNLP converter. Details of the annotation scheme can be found #[a(href="http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") here].
|
||||
|
||||
details
|
||||
summary: h4 Named Entity Recognition
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
|
||||
tbody
|
||||
+row("PERSON", "People, including fictional.")
|
||||
+row("NORP", "Nationalities or religious or political groups.")
|
||||
+row("FACILITY", "Buildings, airports, highways, bridges, etc.")
|
||||
+row("ORG", "Companies, agencies, institutions, etc.")
|
||||
+row("GPE", "Countries, cities, states.")
|
||||
+row("LOC", "Non-GPE locations, mountain ranges, bodies of water.")
|
||||
+row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services.)")
|
||||
+row("EVENT", "Named hurricanes, battles, wars, sports events, etc.")
|
||||
+row("WORK_OF_ART", "Titles of books, songs, etc.")
|
||||
+row("LAW", "Named documents made into laws")
|
||||
+row("LANGUAGE", "Any named language")
|
||||
|
||||
p The following values are also annotated in a style similar to names:
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("Entity Type", "Description")
|
||||
|
||||
tbody
|
||||
+row("DATE", "Absolute or relative dates or periods")
|
||||
+row("TIME", "Times smaller than a day")
|
||||
+row("PERCENT", 'Percentage (including “%”)')
|
||||
+row("MONEY", "Monetary values, including unit")
|
||||
+row("QUANTITY", "Measurements, as of weight or distance")
|
||||
+row("ORDINAL", '"first", "second"')
|
||||
+row("CARDINAL", "Numerals that do not fall under another type")
|
28
website/src/jade/docs/index.jade
Normal file
28
website/src/jade/docs/index.jade
Normal file
|
@ -0,0 +1,28 @@
|
|||
include ../mixins.jade
|
||||
include ../header.jade
|
||||
|
||||
|
||||
- var Page = InitPage(Site, Authors.spacy, 'docs', 'Docs')
|
||||
|
||||
|
||||
+WritePage(Site, Authors.spacy, Page)
|
||||
section.intro
|
||||
p #[strong spaCy] consists of a vocabulary table that stores lexical types, a pipeline that produces annotations, and three classes to manipulate document, span and token data. The annotations are predicted using statistical models, according to specifications that follow common practice in the research community.
|
||||
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a.button(href="#api") API
|
||||
li: a.button(href="#tutorials") Tutorials
|
||||
li: a.button(href="#spec") Spec
|
||||
|
||||
article
|
||||
+Section("API", "api", "_api.jade")
|
||||
|
||||
section.intro
|
||||
h2 #[a.permalink(href='#tutorials', name='tutorials') Tutorials]
|
||||
|
||||
section.tutorials
|
||||
include ../tutorials/_teaser.jade
|
||||
|
||||
article
|
||||
+Section("Annotation Specifications", "spec", "_spec.jade")
|
193
website/src/jade/header.jade
Normal file
193
website/src/jade/header.jade
Normal file
|
@ -0,0 +1,193 @@
|
|||
//- Site-wide metadata. InitPage copies description/image/image_small into
//- each page as defaults; WritePage reads name, slogan, twitter and url
//- when emitting the head tags and the page header.
- Site = {}
|
||||
- Site.name = "spaCy.io"
|
||||
- Site.slogan = "Build Tomorrow's Language Technologies"
|
||||
- Site.description = "spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle."
|
||||
- Site.image = "http://spacy.io/resources/img/social.png"
|
||||
- Site.image_small = "http://spacy.io/resources/img/social_small.png"
|
||||
- Site.twitter = "spacy_io"
|
||||
//- Site.url has no trailing slash: WritePage builds og:url as Site.url + Page.url.
- Site.url = "http://spacy.io"
|
||||
-
|
||||
//- Author records, keyed by the id that WritePost looks up via Meta.author_id.
//- bio holds raw HTML: WriteAuthorBio emits it unescaped with !{Author.bio}.
- Authors = {"matt": {}, "spacy": {}};
|
||||
- Authors.matt.name = "Matthew Honnibal"
|
||||
- Authors.matt.bio = "Matthew Honnibal is the author of the <a href=\"http://spacy.io\">spaCy</a> software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin."
|
||||
|
||||
- Authors.matt.image = "/resources/img/matt.png"
|
||||
- Authors.matt.twitter = "honnibal"
|
||||
-
|
||||
//- The "spacy" author is the one docs/index.jade passes to InitPage and WritePage.
- Authors.spacy.name = "SpaCy.io"
|
||||
- Authors.spacy.bio = "<a href=\"http://spacy.io\">spaCy</a> is a library for industrial-strength natural language processing in Python and Cython. It features state-of-the-art speed and accuracy, a concise API, and great documentation. If you're a small company doing NLP, we want spaCy to seem like a minor miracle."
|
||||
- Authors.spacy.image = "/resources/img/social_small.png"
|
||||
- Authors.spacy.twitter = "spacy_io"
|
||||
|
||||
//- Build the Page descriptor that the WritePage mixin renders.
//-   Site     - site-wide settings; name, description, image and image_small are read
//-   Author   - accepted so the signature matches InitPost; not read here
//-   type     - page type ("home", "blog", "docs", ...); also used as the URL slug
//-   headline - human-readable headline of the page
//- Returns a new object with headline, type, active, links, url, description,
//- image, image_small, title and title_full.
- function InitPage(Site, Author, type, headline) {
-   var isHome = (type === "home");
-   var Page = {};
-   Page.headline = headline;
-   Page.type = type;
-   // Flag map the navigation reads to mark the current section as active.
-   Page.active = {};
-   Page.active[type] = true;
-   Page.links = [];
-   // The home page lives at the site root; every other type gets "/<type>".
-   Page.url = isHome ? "" : "/" + type;
-   // Defaults inherited from the site.
-   Page.description = Site.description;
-   Page.image = Site.image;
-   Page.image_small = Site.image_small;
-   // Titles: a blog post keeps the bare headline as its short title, the
-   // home page leads with the site name, and every other page is
-   // "<headline> | <site name>".
-   if (type === "blog") {
-     Page.title = headline;
-     Page.title_full = Page.title + " | " + Site.name;
-   } else if (isHome) {
-     Page.title = Site.name + ' | ' + headline;
-     Page.title_full = Page.title;
-   } else {
-     Page.title = headline + " | " + Site.name;
-     Page.title_full = Page.title;
-   }
-   return Page;
- }
|
||||
|
||||
//- Build the Page descriptor for a blog post from its Meta record.
//-   Site   - site-wide settings, passed through to InitPage
//-   Author - author record, passed through to InitPage
//-   Meta   - post metadata: headline, date and url are copied as-is (url is
//-            appended to Site.url by WritePage); description, links, image
//-            and image_small are optional
//- Returns the object produced by InitPage(Site, Author, "blog", Meta.headline)
//- with the post-specific fields applied.
- function InitPost(Site, Author, Meta) {
-   // InitPage already sets headline, active.blog and the site-wide defaults.
-   var Page = InitPage(Site, Author, "blog", Meta.headline);
-   Page.date = Meta.date;
-   Page.url = Meta.url;
-   // Optional fields: only override a default when the post supplies a value.
-   // "!= null" is deliberate, so that both null and undefined count as missing.
-   if (Meta.description != null) {
-     Page.description = Meta.description;
-   }
-   if (Meta.links != null) {
-     Page.links = Meta.links;
-   }
-   if (Meta.image != null) {
-     Page.image = Meta.image;
-   }
-   if (Meta.image_small != null) {
-     Page.image_small = Meta.image_small;
-   }
-   return Page;
- }
|
||||
|
||||
mixin WritePage(Site, Author, Page)
|
||||
doctype html
|
||||
html(lang="en")
|
||||
head
|
||||
title= Page.title_full
|
||||
|
||||
meta(charset="utf-8")
|
||||
meta(name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no")
|
||||
meta(name="description" content=Page.description)
|
||||
meta(itemprop="name" content=Page.title)
|
||||
meta(itemprop="description" content=Page.description)
|
||||
meta(itemprop="image" content=Page.image)
|
||||
meta(name="twitter:card" content="summary")
|
||||
meta(name="twitter:site" content=Site.twitter)
|
||||
meta(name="twitter:title" content=Page.title)
|
||||
meta(name="twitter:description" content=Page.description)
|
||||
meta(name="twitter:creator" content="@" + Author.twitter)
|
||||
meta(name="twitter:image" content=Page.image_small)
|
||||
meta(property="og:title" content=Page.title)
|
||||
meta(property="og:type" content="article")
|
||||
meta(property="og:url" content=Site.url + Page.url)
|
||||
meta(property="og:image" content=Page.image)
|
||||
meta(property="og:description" content=Page.description)
|
||||
meta(property="og:site_name" content=Site.name)
|
||||
meta(property="article:published_time" content=getDate(Page.date).timestamp)
|
||||
link(rel="stylesheet" href="/resources/css/style.css")
|
||||
|
||||
//[if lt IE 9]><script src="http://html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]
|
||||
|
||||
body(id=Page.type)
|
||||
header(role="banner")
|
||||
h1.logo !{Site.name}
|
||||
div.slogan
|
||||
if Page.is_error
|
||||
| #{Site.slogan}
|
||||
else if Page.type == "home"
|
||||
| #{Site.slogan}
|
||||
else
|
||||
| #{Page.type.charAt(0).toUpperCase() + Page.type.slice(1)}
|
||||
nav(role="navigation")
|
||||
li(class={active: Page.active.home}): a(href="/") Home
|
||||
li(class={active: Page.active.docs}): a(href="/docs") Docs
|
||||
li: a(href="/displacy", target="_blank") Demo
|
||||
li(class={active: Page.active.license}): a(href="/license") License
|
||||
li(class={active: Page.active.blog}): a(href="/blog") Blog
|
||||
main#content
|
||||
block
|
||||
script(src="/resources/js/prism.min.js")
|
||||
// Details polyfill
|
||||
script
|
||||
| // Copy each details element's initial "open" state into a data-open
| // attribute, then flip that attribute whenever its summary is clicked.
| var details = document.getElementsByTagName("details");
| var summary = document.getElementsByTagName("summary");
| for (var i = 0; i < details.length; i++) {
|   details[i].setAttribute("data-open", (details[i].getAttribute("open") == null) ? "false" : "true");
| }
| for (var i = 0; i < summary.length; i++) {
|   summary[i].addEventListener("click", function(e) {
|     var parent = this.parentElement;
|     parent.setAttribute("data-open", (parent.getAttribute("data-open") == "false") ? "true" : "false");
|   });
| }
|
||||
script
|
||||
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
|
||||
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
|
||||
| ga('create', 'UA-58931649-1', 'auto');
|
||||
| ga('send', 'pageview');
|
||||
footer(role="contentinfo")
|
||||
span.slogan.copyright
|
||||
| © 2015 Syllogism Co. | #[a(href="mailto:contact@spacy.io") Contact]
|
||||
|
||||
mixin WritePost(Meta)
|
||||
- var Author = Authors[Meta.author_id]
|
||||
- var Page = InitPost(Site, Author, Meta)
|
||||
+WritePage(Site, Author, Page)
|
||||
article.post
|
||||
header
|
||||
h2: strike= Meta.struck_headline
|
||||
h2= Meta.headline
|
||||
+WriteByline(Author, Meta)
|
||||
block
|
||||
footer.meta(role="contentinfo")
|
||||
+WriteShareLinks(Meta.headline, Meta.url, Site.twitter, Meta.links)
|
||||
+WriteAuthorBio(Author)
|
||||
|
||||
mixin WriteByline(Author, Meta)
|
||||
.subhead by #[a(href="//twitter.com/" + Author.twitter, rel="author" target="_blank") #{Author.name}] on #[time #{getDate(Meta.date).fulldate}]
|
||||
|
||||
mixin WriteShareLinks(headline, url, twitter, links)
|
||||
a.button.button-twitter(href="http://twitter.com/share?text=" + headline + "&url=" + Site.url + url + "&via=" + twitter title="Share on Twitter" target="_blank")
|
||||
| Share on Twitter
|
||||
if links
|
||||
.discuss
|
||||
for link in links
|
||||
|
|
||||
a(class="button button-#{link.id}", target="_blank" href=link.url, title="Discuss on " + link.name)
|
||||
if link.title
|
||||
| #{link.title}
|
||||
else
|
||||
| Discuss on #{link.name}
|
||||
|
||||
mixin TweetThis(text, url)
|
||||
p #[span #{text} #[a.share(href='http://twitter.com/share?text="' + text + '"&url=' + Site.url + url + '&via=' + Site.twitter title='Share on Twitter' target='_blank') Tweet]]
|
||||
|
||||
mixin WriteAuthorBio(Author)
|
||||
section.intro.profile
|
||||
p #[img(src=Author.image)] !{Author.bio} #[span.social #[a(href="//twitter.com/" + Author.twitter target="_blank") Twitter]]
|
||||
|
||||
|
||||
//- Format a date for display and for machine-readable metadata.
//-   input - any value the Date constructor accepts
//- Returns an object with two properties:
//-   fulldate  - "<Month> <day>, <year>", built from the local-time getters
//-   timestamp - the ISO 8601 string for the date, or null if input does not
//-               parse to a valid date
- var getDate = function(input) {
-   var months = [ "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December" ];
-   var date = new Date(input);
-   // WritePage asks for a timestamp on every page, including pages that
-   // have no date, so an invalid date must produce null instead of throwing.
-   var isValid = !isNaN(date.getTime());
-   var dates = {};
-   dates.fulldate = months[date.getMonth()] + " " + date.getDate() + ", " + date.getFullYear();
-   dates.timestamp = isValid ? date.toISOString() : null;
-   return dates;
- };
|
151
website/src/jade/home/_comparisons.jade
Normal file
151
website/src/jade/home/_comparisons.jade
Normal file
|
@ -0,0 +1,151 @@
|
|||
|
||||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
||||
mixin comparison(name)
|
||||
details
|
||||
summary
|
||||
h4 #{name}
|
||||
|
||||
block
|
||||
|
||||
|
||||
+comparison("Peer-reviewed Evaluations")(open=true)
|
||||
|
||||
p spaCy is committed to rigorous evaluation under standard methodology. Two papers in 2015 confirm that:
|
||||
|
||||
ol
|
||||
li spaCy is the fastest syntactic parser in the world;
|
||||
li Its accuracy is within 1% of the best available;
|
||||
li The few systems that are more accurate are 20× slower or more.
|
||||
|
||||
p spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, as part of a survey paper benchmarking the current state-of-the-art dependency parsers (#[a(href="http://aclweb.org/anthology/P/P15/P15-1038.pdf") Choi et al., 2015]).
|
||||
|
||||
table
|
||||
thead
|
||||
+columns("System", "Language", "Accuracy", "Speed")
|
||||
|
||||
tbody
|
||||
+row("spaCy v0.84", "Cython", "90.6", "13,963")
|
||||
+row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)")
|
||||
+row("ClearNLP", "Java", "91.7", "10,271")
|
||||
+row("CoreNLP", "Java", "89.6", "8,602")
|
||||
+row("MATE", "Java", "92.5", "550")
|
||||
+row("Turbo", "C++", "92.4", "349")
|
||||
+row("Yara", "Java", "92.3", "340")
|
||||
|
||||
p Discussion with the authors led to accuracy improvements in spaCy, which have been accepted for publication in EMNLP, in joint work with Macquarie University (#[a(href="honnibal_johnson_emnlp2015.pdf") Honnibal and Johnson, 2015]).
|
||||
|
||||
|
||||
+comparison("How does spaCy compare to NLTK?")
|
||||
.columnar
|
||||
.col
|
||||
h5 spaCy
|
||||
ul
|
||||
li.pro Over 400 times faster
|
||||
li.pro State-of-the-art accuracy
|
||||
li.pro Tokenizer maintains alignment
|
||||
li.pro Powerful, concise API
|
||||
li.pro Integrated word vectors
|
||||
li.con English only (at present)
|
||||
.col
|
||||
h5 NLTK
|
||||
ul
|
||||
li.con Slow
|
||||
li.con Low accuracy
|
||||
li.con Tokens do not align to original string
|
||||
li.con Models return lists of strings
|
||||
li.con No word vector support
|
||||
li.pro Multiple languages
|
||||
|
||||
|
||||
+comparison("How does spaCy compare to CoreNLP?")
|
||||
.columnar
|
||||
.col
|
||||
h5 spaCy
|
||||
ul
|
||||
li.pro 50% faster
|
||||
li.pro More accurate parser
|
||||
li.pro Word vectors integration
|
||||
li.pro Minimalist design
|
||||
li.pro Great documentation
|
||||
li.con English only
|
||||
li.pro Python
|
||||
.col
|
||||
h5 CoreNLP features:
|
||||
ul
|
||||
li.pro More accurate NER
|
||||
li.pro Coreference resolution
|
||||
li.pro Sentiment analysis
|
||||
li.con Little documentation
|
||||
li.pro Multiple languages
|
||||
li.neutral Java
|
||||
|
||||
+comparison("How does spaCy compare to ClearNLP?")
|
||||
|
||||
.columnar
|
||||
|
||||
.col
|
||||
h5 spaCy
|
||||
ul
|
||||
li.pro 30% faster
|
||||
li.pro Well documented
|
||||
li.con English only
|
||||
li.neutral Equivalent accuracy
|
||||
li.pro Python
|
||||
|
||||
.col
|
||||
h5 ClearNLP:
|
||||
ul
|
||||
li.pro Semantic Role Labelling
|
||||
li.pro Model for biology/life-science
|
||||
li.pro Multiple Languages
|
||||
li.neutral Equivalent accuracy
|
||||
li.neutral Java
|
||||
|
||||
//-+comparison("Accuracy Summary")
|
||||
//-+comparison("Speed Summary")
|
||||
//- table
|
||||
//- thead
|
||||
//- tr
|
||||
//- th.
|
||||
//- th(colspan=3) Absolute (ms per doc)
|
||||
//- th(colspan=3) Relative (to spaCy)
|
||||
//-
|
||||
//- tbody
|
||||
//- tr
|
||||
//- td: strong System
|
||||
//- td: strong Split
|
||||
//- td: strong Tag
|
||||
//- td: strong Parse
|
||||
//- td: strong Split
|
||||
//- td: strong Tag
|
||||
//- td: strong Parse
|
||||
//-
|
||||
//- +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
||||
//- +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
||||
//- +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
|
||||
//- +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
|
||||
//- +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")
|
||||
//-
|
||||
//- p
|
||||
//- | <strong>Set up</strong>: 100,000 plain-text documents were streamed
|
||||
//- | from an SQLite3 database, and processed with an NLP library, to one
|
||||
//- | of three levels of detail – tokenization, tagging, or parsing.
|
||||
//- | The tasks are additive: to parse the text you have to tokenize and
|
||||
//- | tag it. The pre-processing was not subtracted from the times –
|
||||
//- | I report the time required for the pipeline to complete. I report
|
||||
//- | mean times per document, in milliseconds.
|
||||
//-
|
||||
//- p
|
||||
//- | <strong>Hardware</strong>: Intel i7-3770 (2012)
|
||||
|
200
website/src/jade/home/_installation.jade
Normal file
200
website/src/jade/home/_installation.jade
Normal file
|
@ -0,0 +1,200 @@
|
|||
mixin Option(name, open)
|
||||
details(open=open)
|
||||
summary
|
||||
h4= name
|
||||
block
|
||||
|
||||
+Option("Updating your installation")
|
||||
| To update your installation:
|
||||
|
||||
pre.language-bash
|
||||
code
|
||||
$ pip install --upgrade spacy
|
||||
$ python -m spacy.en.download all
|
||||
p Most updates ship a new model, so you will usually have to redownload the data.
|
||||
|
||||
|
||||
|
||||
+Option("conda", true)
|
||||
pre.language-bash: code
|
||||
| $ conda install spacy
|
||||
| $ python -m spacy.en.download all
|
||||
|
||||
+Option("pip and virtualenv", true)
|
||||
p With Python 2.7 or Python 3, using Linux or OSX, ensure that you have the following packages installed:
|
||||
|
||||
pre.language-bash: code
|
||||
| build-essential python-dev
|
||||
|
||||
p Then run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ pip install spacy
|
||||
| $ python -m spacy.en.download all
|
||||
|
||||
p
|
||||
| The download command fetches and installs about 400mb of data, for
|
||||
| the parser model and word vectors, which it installs within the spacy.en
|
||||
| package directory.
|
||||
|
||||
|
||||
+Option("Workaround for obsolete system Python", false)
|
||||
p
|
||||
| If you're stuck using a server with an old version of Python, and you
|
||||
| don't have root access, I've prepared a bootstrap script to help you
|
||||
| compile a local Python install. Run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
|
||||
|
||||
+Option("Compile from source", false)
|
||||
p
|
||||
| The other way to install the package is to clone the github repository,
|
||||
| and build it from source. This installs an additional dependency,
|
||||
| Cython. If you're using Python 2, I also recommend installing fabric
|
||||
| and fabtools – this is how I build the project.
|
||||
|
|
||||
| Ensure that you have the following packages installed:
|
||||
|
||||
pre.language-bash: code
|
||||
| build-essential python-dev git python-virtualenv
|
||||
|
||||
pre.language-bash: code
|
||||
| $ git clone https://github.com/honnibal/spaCy.git
|
||||
| $ cd spaCy
|
||||
| $ virtualenv .env && source .env/bin/activate
|
||||
| $ export PYTHONPATH=`pwd`
|
||||
| $ pip install -r requirements.txt
|
||||
| $ python setup.py build_ext --inplace
|
||||
| $ python -m spacy.en.download
|
||||
| $ pip install pytest
|
||||
| $ py.test tests/
|
||||
|
||||
p
|
||||
| Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
| with C extensions, built via Cython, requiring large data files. So,
|
||||
| please report issues as you encounter them.
|
||||
|
||||
+Option("pypy (Unsupported)")
|
||||
| If PyPy support is a priority for you, please get in touch. We could likely
|
||||
| fix the remaining issues, if necessary. However, the library is likely to
|
||||
| be much slower on PyPy, as it's written in Cython, which produces code tuned
|
||||
| for the performance of CPython.
|
||||
|
||||
+Option("Windows (Unsupported)")
|
||||
| Unfortunately we don't currently support Windows.
|
||||
|
||||
h4 What's New?
|
||||
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-09-21 v0.92: Bug fixes to word vectors. Rename .repvec to .vector. Rename .string attribute.
|
||||
|
||||
ul
|
||||
li Bug fixes for word vectors.
|
||||
li The attribute to access word vectors was formerly named #[code token.repvec]. This has been renamed #[code token.vector]. The #[code .repvec] name is now deprecated. It will remain available until the next version.
|
||||
li Add #[code .vector] attributes to #[code Doc] and #[code Span] objects, which gives the average of their word vectors.
|
||||
li Add a #[code .similarity] method to #[code Token], #[code Doc], #[code Span] and #[code Lexeme] objects, that performs cosine similarity over word vectors.
|
||||
li The attribute #[code .string], which gave a whitespace-padded string representation, is now renamed #[code .text_with_ws]. A new #[code .text] attribute has been added. The #[code .string] attribute is now deprecated. It will remain available until the next version.
|
||||
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-09-15 v0.90: Refactor to allow multi-lingual, better customization.
|
||||
|
||||
ul
|
||||
li Move code out of #[code spacy.en] module, to allow new languages to be added more easily.
|
||||
li New #[code Matcher] class, to support token-based expressions, for custom named entity rules.
|
||||
li Users can now write to #[code Lexeme] objects, to set their own flags and string attributes. Values set on the vocabulary are inherited by tokens. This is especially effective in combination with the rule logic.
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-07-29 v0.89: Fix Spans, efficiency
|
||||
|
||||
ul
|
||||
li Fix regression in parse times on very long texts. Recent versions were calculating parse features in a way that was polynomial in input length.
|
||||
li Add tag SP (coarse tag SPACE) for whitespace tokens. Ensure entity recogniser does not assign entities to whitespace.
|
||||
li Rename #[code Span.head] to #[code Span.root], fix its documentation, and make it more efficient.
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-07-08 v0.88: Refactoring release.
|
||||
|
||||
ul
|
||||
li If you have the data for v0.87, you don't need to redownload the data for this release.
|
||||
li You can now set #[code tag=False], #[code parse=False] or #[code entity=False] when creating the pipeline, to disable some of the models. See the documentation for details.
|
||||
li Models no longer lazy-loaded.
|
||||
li Warning emitted when parse=True or entity=True but model not loaded.
|
||||
li Rename the tokens.Tokens class to tokens.Doc. An alias has been made to assist backwards compatibility, but you should update your code to refer to the new class name.
|
||||
li Various bits of internal refactoring
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-07-01 v0.87: Memory use
|
||||
|
||||
ul
|
||||
li Changed weights data structure. Memory use should be reduced 30-40%. Fixed speed regressions introduced in the last few versions.
|
||||
li Models should now be slightly more robust to noise in the input text, as I'm now training on data with a small amount of noise added, e.g. I randomly corrupt capitalization, swap spaces for newlines, etc. This is bringing a small benefit on out-of-domain data. I think this strategy could yield better results with a better noise-generation function. If you think you have a good way to make clean text resemble the kind of noisy input you're seeing in your domain, get in touch.
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-06-24 v0.86: Parser accuracy
|
||||
|
||||
p Parser is now more accurate, using a novel non-monotonic transition system that's currently under review.
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-05-12 v0.85: More diverse training data
|
||||
|
||||
ul
|
||||
li Parser produces richer dependency labels following the ClearNLP scheme.
|
||||
li Training data now includes text from a variety of genres.
|
||||
li Parser now uses more memory and the data is slightly larger, due to the additional labels. Impact on efficiency is minimal: entire process still takes <10ms per document.
|
||||
|
||||
p Most users should see a substantial increase in accuracy from the new model.
|
||||
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-05-12 v0.84: Bug fixes
|
||||
|
||||
ul
|
||||
li Bug fixes for parsing
|
||||
li Bug fixes for named entity recognition
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-04-13 v0.80
|
||||
|
||||
p Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.
|
||||
|
||||
ul
|
||||
li Better sentence boundary detection, drawn from the syntactic structure.
|
||||
li Lots of bug fixes.
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-03-05: v0.70
|
||||
|
||||
ul
|
||||
li Improved parse navigation API
|
||||
li Bug fixes to labelled parsing
|
||||
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-01-30: v0.4
|
||||
ul
|
||||
li Train statistical models on running text
|
||||
|
||||
details
|
||||
summary
|
||||
h4 2015-01-25: v0.33
|
||||
ul
|
||||
li Alpha release
|
||||
|
||||
|
||||
|
||||
|
83
website/src/jade/home/_installation.jade~
Normal file
83
website/src/jade/home/_installation.jade~
Normal file
|
@ -0,0 +1,83 @@
|
|||
mixin Option(name, open)
|
||||
details(open=open)
|
||||
summary
|
||||
h4= name
|
||||
block
|
||||
|
||||
article.post
|
||||
header
|
||||
h2 #[a(href=Meta.url)
|
||||
|
||||
p What's new in v0.90?
|
||||
|
||||
.subhead by #[a(href="//twitter.com/spacy_io", rel="author" target="_blank") #{spaCy}] on #[time #{getDate(Meta.date).fulldate}]
|
||||
|
||||
ul
|
||||
li Support for gazetteers
|
||||
li Set Lexeme attributes
|
||||
#[a.readmore(href=Meta.url) Full Change Log ►]
|
||||
|
||||
|
||||
section.intro
|
||||
p What's
|
||||
|
||||
+Option("conda", true)
|
||||
pre.language-bash: code
|
||||
| $ conda install spacy
|
||||
| $ python -m spacy.en.download
|
||||
|
||||
+Option("pip and virtualenv", true)
|
||||
p With Python 2.7 or Python 3, using Linux or OSX, run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ pip install spacy
|
||||
| $ python -m spacy.en.download
|
||||
|
||||
p
|
||||
| The download command fetches and installs about 300mb of data, for
|
||||
| the parser model and word vectors, which it installs within the spacy.en
|
||||
| package directory.
|
||||
|
||||
|
||||
+Option("Workaround for obsolete system Python", false)
|
||||
p
|
||||
| If you're stuck using a server with an old version of Python, and you
|
||||
| don't have root access, I've prepared a bootstrap script to help you
|
||||
| compile a local Python install. Run:
|
||||
|
||||
pre.language-bash: code
|
||||
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
|
||||
|
||||
|
||||
+Option("Compile from source", false)
|
||||
p
|
||||
| The other way to install the package is to clone the github repository,
|
||||
| and build it from source. This installs an additional dependency,
|
||||
| Cython. If you're using Python 2, I also recommend installing fabric
|
||||
| and fabtools – this is how I build the project.
|
||||
|
||||
pre.language-bash: code
|
||||
| $ git clone https://github.com/honnibal/spaCy.git
|
||||
| $ cd spaCy
|
||||
| $ virtualenv .env && source .env/bin/activate
|
||||
| $ export PYTHONPATH=`pwd`
|
||||
| $ pip install -r requirements.txt
|
||||
| $ python setup.py build_ext --inplace
|
||||
| $ python -m spacy.en.download
|
||||
| $ pip install pytest
|
||||
| $ py.test tests/
|
||||
|
||||
p
|
||||
| Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
| with C extensions, built via Cython, requiring large data files. So,
|
||||
| please report issues as you encounter them.
|
||||
|
||||
+Option("pypy (Unsupported)")
|
||||
| If PyPy support is a priority for you, please get in touch. We could likely
|
||||
| fix the remaining issues, if necessary. However, the library is likely to
|
||||
| be much slower on PyPy, as it's written in Cython, which produces code tuned
|
||||
| for the performance of CPython.
|
||||
|
||||
+Option("Windows (Unsupported)")
|
||||
| Unfortunately we don't currently support Windows.
|
22
website/src/jade/home/_online_demo.jade
Normal file
22
website/src/jade/home/_online_demo.jade
Normal file
|
@ -0,0 +1,22 @@
|
|||
mixin Displacy(sentence, caption_text, height)
|
||||
- var url = "/displacy/?full=" + sentence.replace(" ", "%20")
|
||||
|
||||
.displacy
|
||||
iframe.displacy(src="/resources/displacy/displacy_demo.html" height=height)
|
||||
|
||||
a.view-displacy(href=url, target="_blank")
|
||||
| Interactive Visualizer
|
||||
|
||||
p.caption.
|
||||
#{caption_text}
|
||||
|
||||
|
||||
+Displacy(
|
||||
"Click the button to see this sentence in displaCy.",
|
||||
"The best parse-tree visualizer and annotation tool in all the land.",
|
||||
275
|
||||
)
|
||||
|
||||
p #[a(href="/displacy") displaCy] lets you peek inside spaCy's syntactic parser, as it reads a sentence word-by-word. By repeatedly choosing from a small set of actions, it links the words together according to their syntactic structure. This type of representation powers a wide range of technologies, from translation and summarization, to sentiment analysis and algorithmic trading. #[a(href="/blog/displacy") Read more.]
|
||||
|
||||
|
162
website/src/jade/home/_usage.jade
Normal file
162
website/src/jade/home/_usage.jade
Normal file
|
@ -0,0 +1,162 @@
|
|||
mixin example(name)
|
||||
details
|
||||
summary
|
||||
h4= name
|
||||
block
|
||||
|
||||
|
||||
+example("Load resources and process text")
|
||||
pre.language-python: code
|
||||
| from __future__ import unicode_literals, print_function
|
||||
| from spacy.en import English
|
||||
| nlp = English()
|
||||
| doc = nlp('Hello, world. Here are two sentences.')
|
||||
|
||||
+example("Get tokens and sentences")
|
||||
pre.language-python: code
|
||||
| token = doc[0]
|
||||
| sentence = next(doc.sents)
|
||||
| assert token is sentence[0]
|
||||
| assert sentence.text == 'Hello, world.'
|
||||
|
||||
+example("Use integer IDs for any string")
|
||||
pre.language-python: code
|
||||
| hello_id = nlp.vocab.strings['Hello']
|
||||
| hello_str = nlp.vocab.strings[hello_id]
|
||||
|
|
||||
| assert token.orth == hello_id == 469755
|
||||
| assert token.orth_ == hello_str == 'Hello'
|
||||
|
||||
+example("Get and set string views and flags")
|
||||
pre.language-python: code
|
||||
| assert token.shape_ == 'Xxxxx'
|
||||
| for lexeme in nlp.vocab:
|
||||
| if lexeme.is_alpha:
|
||||
| lexeme.shape_ = 'W'
|
||||
| elif lexeme.is_digit:
|
||||
| lexeme.shape_ = 'D'
|
||||
| elif lexeme.is_punct:
|
||||
| lexeme.shape_ = 'P'
|
||||
| else:
|
||||
| lexeme.shape_ = 'M'
|
||||
| assert token.shape_ == 'W'
|
||||
|
||||
+example("Export to numpy arrays")
|
||||
pre.language-python: code
|
||||
| from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
|
||||
|
|
||||
| attr_ids = [ORTH, LIKE_URL, IS_OOV]
|
||||
| doc_array = doc.to_array(attr_ids)
|
||||
| assert doc_array.shape == (len(doc), len(attr_ids))
|
||||
| assert doc[0].orth == doc_array[0, 0]
|
||||
| assert doc[1].orth == doc_array[1, 0]
|
||||
| assert doc[0].like_url == doc_array[0, 1]
|
||||
| assert list(doc_array[:, 1]) == [t.like_url for t in doc]
|
||||
|
||||
+example("Word vectors")
|
||||
pre.language-python: code
|
||||
| doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
|
||||
|
|
||||
| apples = doc[0]
|
||||
| oranges = doc[2]
|
||||
| boots = doc[6]
|
||||
| hippos = doc[8]
|
||||
|
|
||||
| assert apples.similarity(oranges) > boots.similarity(hippos)
|
||||
|
||||
+example("Part-of-speech tags")
|
||||
pre.language-python: code
|
||||
| from spacy.parts_of_speech import ADV
|
||||
|
|
||||
| def is_adverb(token):
|
||||
| return token.pos == ADV
|
||||
|
|
||||
| # These are data-specific, so no constants are provided. You have to look
|
||||
| # up the IDs from the StringStore.
|
||||
| NNS = nlp.vocab.strings['NNS']
|
||||
| NNPS = nlp.vocab.strings['NNPS']
|
||||
| def is_plural_noun(token):
|
||||
| return token.tag == NNS or token.tag == NNPS
|
||||
|
|
||||
| def print_coarse_pos(token):
|
||||
| print(token.pos_)
|
||||
|
|
||||
| def print_fine_pos(token):
|
||||
| print(token.tag_)
|
||||
|
||||
+example("Syntactic dependencies")
|
||||
pre.language-python: code
|
||||
| def dependency_labels_to_root(token):
|
||||
| '''Walk up the syntactic tree, collecting the arc labels.'''
|
||||
| dep_labels = []
|
||||
| while token.head is not token:
|
||||
| dep_labels.append(token.dep)
|
||||
| token = token.head
|
||||
| return dep_labels
|
||||
|
||||
+example("Named entities")
|
||||
pre.language-python: code
|
||||
| def iter_products(docs):
|
||||
| for doc in docs:
|
||||
| for ent in doc.ents:
|
||||
| if ent.label_ == 'PRODUCT':
|
||||
| yield ent
|
||||
|
|
||||
| def word_is_in_entity(word):
|
||||
| return word.ent_type != 0
|
||||
|
|
||||
| def count_parent_verb_by_person(docs):
|
||||
| counts = defaultdict(lambda: defaultdict(int))
|
||||
| for doc in docs:
|
||||
| for ent in doc.ents:
|
||||
| if ent.label_ == 'PERSON' and ent.root.head.pos == VERB:
|
||||
| counts[ent.orth_][ent.root.head.lemma_] += 1
|
||||
| return counts
|
||||
|
||||
//+example("Define custom NER rules")
|
||||
// pre.language-python: code
|
||||
// | nlp.matcher
|
||||
|
||||
|
||||
+example("Calculate inline mark-up on original string")
|
||||
pre.language-python: code
|
||||
| def put_spans_around_tokens(doc, get_classes):
|
||||
| '''Given some function to compute class names, put each token in a
|
||||
| span element, with the appropriate classes computed.
|
||||
|
|
||||
| All whitespace is preserved, outside of the spans. (Yes, I know HTML
|
||||
| won't display it. But the point is no information is lost, so you can
|
||||
| calculate what you need, e.g. <br /> tags, <p> tags, etc.)
|
||||
| '''
|
||||
| output = []
|
||||
| template = '<span class="{classes}">{word}</span>{space}'
|
||||
| for token in doc:
|
||||
| if token.is_space:
|
||||
| output.append(token.orth_)
|
||||
| else:
|
||||
| output.append(
|
||||
| template.format(
|
||||
| classes=' '.join(get_classes(token)),
|
||||
| word=token.orth_,
|
||||
| space=token.whitespace_))
|
||||
| string = ''.join(output)
|
||||
| string = string.replace('\n', '<br />')
|
||||
| string = string.replace('\t', ' ')
|
||||
| return string
|
||||
|
||||
|
||||
+example("Efficient binary serialization")
|
||||
pre.language-python: code
|
||||
| byte_string = doc.as_bytes()
|
||||
| open('/tmp/moby_dick.bin', 'wb').write(byte_string)
|
||||
|
|
||||
| nlp = spacy.en.English()
|
||||
| for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
|
||||
| doc = Doc(nlp.vocab)
|
||||
| doc.from_bytes(byte_string)
|
||||
|
||||
+example("Full documentation")
|
||||
ul
|
||||
li: a(href="/docs#api") API documentation
|
||||
li: a(href="/docs#tutorials") Tutorials
|
||||
li: a(href="/docs/#spec") Annotation specs
|
38
website/src/jade/home/index.jade
Normal file
38
website/src/jade/home/index.jade
Normal file
|
@ -0,0 +1,38 @@
|
|||
mixin LedeParagraph
|
||||
p
|
||||
a(href="//github.com/honnibal/spaCy"): strong spaCy
|
||||
| is a library for industrial-strength natural language processing in Python and Cython. It features
|
||||
a(href="#comparisons") state-of-the-art
|
||||
| speed and accuracy, a concise API, and
|
||||
a(href="/docs") great documentation
|
||||
| . If you're a small company doing NLP, we want
|
||||
strong spaCy
|
||||
| to seem like
|
||||
a(href="/blog/introducing-spacy") a minor miracle
|
||||
| .
|
||||
|
||||
|
||||
include ../mixins.jade
|
||||
include ../header.jade
|
||||
|
||||
- var Page = InitPage(Site, Authors.spacy, "home", Site.slogan)
|
||||
- Page.active.home = true
|
||||
|
||||
|
||||
+WritePage(Site, Authors['spacy'], Page)
|
||||
section.intro
|
||||
+LedeParagraph
|
||||
nav(role="navigation")
|
||||
ul
|
||||
li: a.button(href="#comparisons") Comparisons
|
||||
li: a.button(href="#online-demo") Try Online
|
||||
li: a.button(href="#example-use") Examples
|
||||
li: a.button(href="#install")
|
||||
| Install
|
||||
<span class="button-caption">v0.92</span>
|
||||
|
||||
article.page.landing-page
|
||||
+Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
|
||||
+Section("Online Demo", "online-demo", "./_online_demo.jade")
|
||||
+Section("Usage by Example", "example-use", "./_usage_examples.jade")
|
||||
+Section("Install v0.92", "install", "./_installation.jade")
|
38
website/src/jade/license/index.jade
Normal file
38
website/src/jade/license/index.jade
Normal file
|
@ -0,0 +1,38 @@
|
|||
include ../header.jade
|
||||
|
||||
mixin LicenseOption(name, period, price, audience)
|
||||
.item
|
||||
h4 #{name}
|
||||
|
||||
.focus #{period}
|
||||
|
||||
span #{price}
|
||||
|
||||
h5 Suggested for:
|
||||
|
||||
span #{audience}
|
||||
|
||||
a.button(href="/resources/pdf/spaCy_License_Agreement_2015.pdf", target="_blank") Download license
|
||||
|
||||
span or #[a(href="mailto:sales@spacy.io") get in touch]
|
||||
|
||||
- var Page = InitPage(Site, Authors.spacy, "license", "License")
|
||||
|
||||
+WritePage(Site, Authors.spacy, Page)
|
||||
article.pricing
|
||||
.box.license
|
||||
+LicenseOption("Trial", "90 days", "$0", "Evaluation")
|
||||
+LicenseOption("Production", "1 year", "$5,000", "Production")
|
||||
+LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning")
|
||||
|
||||
p.caption Researcher, hobbyist, or open-source developer? spaCy also offers #[a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3] licenses.
|
||||
|
||||
blockquote.pull-quote
|
||||
p Let's face it: Services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt.
|
||||
|
||||
p You need the source, and you need to know you can buy a long-term license. So that's what we offer. The difference between this and a black-box API is night and day.
|
||||
|
||||
p Let's face it: services disappear. Constantly. The good start-ups get bought; the bad ones go bankrupt. Open-source projects become abandoned or bloated. Google's graveyard is over-flowing – ditto for Yahoo!, Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset?
|
||||
|
||||
p A 5 year license won't expire until 2020. spaCy will be with you for longer than most of your current staff. If that's still not enough, get in touch. We can surely work something out.
|
||||
|
32
website/src/jade/mixins.jade
Normal file
32
website/src/jade/mixins.jade
Normal file
|
@ -0,0 +1,32 @@
|
|||
mixin Section(title_text, link_name, include_file)
|
||||
|
||||
h2: a(name=link_name href="#" + link_name) #{title_text}
|
||||
|
||||
if (link_name == "example-use")
|
||||
include ./home/_usage.jade
|
||||
else if (link_name == "online-demo")
|
||||
include ./home/_online_demo.jade
|
||||
else if (link_name == "comparisons")
|
||||
include ./home/_comparisons.jade
|
||||
else if (link_name == "install")
|
||||
include ./home/_installation.jade
|
||||
else if (link_name == "api")
|
||||
include ./docs/_api.jade
|
||||
else if (link_name == "tutorials")
|
||||
include ./tutorials/_teaser.jade
|
||||
else if (link_name == "spec")
|
||||
include ./docs/_spec.jade
|
||||
|
||||
|
||||
mixin columns(...names)
|
||||
tr
|
||||
each name in names
|
||||
th= name
|
||||
|
||||
|
||||
mixin row(...cells)
|
||||
tr
|
||||
each cell in cells
|
||||
td= cell
|
||||
|
||||
|
16
website/src/jade/tutorials/_teaser.jade
Normal file
16
website/src/jade/tutorials/_teaser.jade
Normal file
|
@ -0,0 +1,16 @@
|
|||
mixin Tutorial(name)
|
||||
if name == "mark-adverbs"
|
||||
include ./mark-adverbs/meta.jade
|
||||
else if name == "syntax-search"
|
||||
include ./syntax-search/meta.jade
|
||||
else if name == "twitter-filter"
|
||||
include ./twitter-filter/meta.jade
|
||||
details(open)
|
||||
summary
|
||||
h4= Meta.headline
|
||||
p #[a(href=Meta.url) #{Meta.description}] #[a.readmore(href=Meta.url) ►]
|
||||
|
||||
|
||||
+Tutorial("mark-adverbs")
|
||||
+Tutorial("syntax-search")
|
||||
+Tutorial("twitter-filter")
|
136
website/src/jade/tutorials/add-a-language/index.jade
Normal file
136
website/src/jade/tutorials/add-a-language/index.jade
Normal file
|
@ -0,0 +1,136 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
|
||||
+WritePost(Meta)
|
||||
section.intro
|
||||
p This document explains how to add languages to spaCy. It will be updated and improved as languages are added. For now, focus is on adding a new tokenizer. Further resources require annotated and unannotated data. For annotated data, we will select and license appropriate treebanks. For unannotated data, we will default to using Wikipedia. The running example will be German.
|
||||
|
||||
details(open=true)
|
||||
summary: h4 Setup
|
||||
|
||||
p Create the directories that the language data and the model class reside in.
|
||||
|
||||
pre.language-bash: code
|
||||
| $ mkdir spacy/de
|
||||
| $ mkdir lang_data/de
|
||||
| $ mkdir corpora/de
|
||||
|
||||
p Write an initial #[code spacy/de/__init__.py] file.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from __future__ import unicode_literals, print_function
|
||||
| from os import path
|
||||
| from ..language import Language
|
||||
|
|
||||
| class German(Language):
|
||||
| @classmethod
|
||||
| def default_data_dir(cls):
|
||||
| return path.join(path.dirname(__file__), 'data')
|
||||
|
||||
details(open=true)
|
||||
summary: h4 Tokenizer
|
||||
|
||||
p spaCy's tokenizer first collects a "chunk" of characters. To allow alignment, whitespace characters are preserved, but in separate chunks. However, chunks which are simply the character ' ' are suppressed. For example:
|
||||
|
||||
table
|
||||
tr
|
||||
td 'One two, "three"'
|
||||
td ['One', 'two,', '"three"']
|
||||
tr
|
||||
td 'One three four'
|
||||
td ['One', ' ', 'three', 'four']
|
||||
tr
|
||||
td ' \t\n two three'
|
||||
td [' \t\n ', 'two', 'three']
|
||||
|
||||
p The chunks are then tokenized further, using prefix, suffix and infix expressions, with special-cases specified separately, to handle contractions, fused tokens, and tokens with protected punctuation or affix characters. Specifying these special-cases separately greatly simplifies the rules.
|
||||
|
||||
|
||||
details
|
||||
summary: h4 lang_data/de/prefix.txt
|
||||
|
||||
pre
|
||||
| ,
|
||||
| "
|
||||
| (
|
||||
| [
|
||||
| {
|
||||
| *
|
||||
| <
|
||||
| $
|
||||
| £
|
||||
| “
|
||||
| '
|
||||
| ``
|
||||
| `
|
||||
| #
|
||||
| ‘
|
||||
| ....
|
||||
| ...
|
||||
details
|
||||
summary: h4 lang_data/de/suffix.txt
|
||||
|
||||
pre
|
||||
| )
|
||||
| ]
|
||||
| }
|
||||
| "
|
||||
| ;
|
||||
| .
|
||||
| ,
|
||||
|
||||
details
|
||||
summary: h4 lang_data/de/infix.txt
|
||||
|
||||
pre
|
||||
| \.\.\.
|
||||
| (?<=[a-z])\.(?=[A-Z])
|
||||
| (?<=[a-zA-Z])-(?=[a-zA-z])
|
||||
|
||||
details
|
||||
summary: h4 specials.json
|
||||
|
||||
p A lookup-table is used to handle contractions and fused tokens. The lookup table is specified in a json file. Each entry is keyed by the string. Its value is a list of token specifications, including the form, lemma, part-of-speech, and morphological features.
|
||||
|
||||
details
|
||||
summary: h4 lemma_rules.json
|
||||
|
||||
p Write lemmatization rules, and a list of exceptions. The English lemmatization rules can be seen #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/lemma_rules.json") here], used by the lemmatizer #[a(href="https://github.com/honnibal/spaCy/blob/master/spacy/lemmatizer.py") here].
|
||||
|
||||
details
|
||||
summary: h4 tag_map.json
|
||||
|
||||
p Write a #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/tag_map.json") tag_map.json] file, which maps the language-specific treebank tag scheme to the #[a(href="http://universaldependencies.github.io/docs/") universal part-of-speech scheme], with additional morphological features.
|
||||
|
||||
details
|
||||
summary: h4 morphs.json
|
||||
|
||||
p Write a #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/morphs.json") morphs.json] file, which lists special-cases for the morphological analyzer and lemmatizer. This file is keyed by the part-of-speech tag, and then by a list of orthographic forms. This allows words which require different morphological features, depending on their part-of-speech tag, to receive them.
|
||||
|
||||
details
|
||||
summary: h4 Prepare Wikipedia dump
|
||||
|
||||
p See #[a(href="https://github.com/wikilinks/wikijson") here].
|
||||
|
||||
|
||||
details: summary: h4 Create frequencies
|
||||
pre.language-bash: code
|
||||
$ python bin/get_freqs.py
|
||||
$ python bin/gather_freqs.py
|
||||
|
||||
details: summary: h4 Brown clusters
|
||||
|
||||
p See #[a(href="https://github.com/percyliang/brown-cluster") here].
|
||||
|
||||
details: summary: h4 Over-write attribute functions if necessary
|
||||
|
||||
details: summary: h4 Run init_model.py
|
||||
|
||||
details: summary: h4 Train part-of-speech tagger
|
||||
|
||||
details: summary: h4 Train dependency parser
|
||||
|
||||
details: summary: h4 Train entity recogniser
|
||||
|
8
website/src/jade/tutorials/add-a-language/meta.jade
Normal file
8
website/src/jade/tutorials/add-a-language/meta.jade
Normal file
|
@ -0,0 +1,8 @@
|
|||
- var Meta = {}
|
||||
- Meta.author_id = 'spacy'
|
||||
- Meta.headline = "Tutorial: Adding a language to spaCy"
|
||||
- Meta.description = "Long awaited documentation for adding a language to spaCy"
|
||||
- Meta.date = "2015-08-18"
|
||||
- Meta.url = "/tutorials/add-a-language"
|
||||
- Meta.active = { "blog": true }
|
||||
- Meta.links = []
|
5
website/src/jade/tutorials/customizing-spacy.jade
Normal file
5
website/src/jade/tutorials/customizing-spacy.jade
Normal file
|
@ -0,0 +1,5 @@
|
|||
include ./meta.jade
|
||||
include ../header.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
p
|
210
website/src/jade/tutorials/dan-text-class.jade
Normal file
210
website/src/jade/tutorials/dan-text-class.jade
Normal file
|
@ -0,0 +1,210 @@
|
|||
include ./meta.jade
|
||||
include ../../header.jade
|
||||
|
||||
|
||||
+WritePost(Meta)
|
||||
pre.language: code
|
||||
| def main():
|
||||
| nlp = English()
|
||||
| ex = Extractor(nlp, width, dropout_rate)
|
||||
| adagrad = Adagrad(learning_rate)
|
||||
| nn = NeuralNetwork(depth, width, n_classes, ex.doc2bow, ex.bow2vec, adagrad)
|
||||
| train(nn, trn, dev, n_epoch=n_epoch, batch_size=batch_size)
|
||||
|
|
||||
|
|
||||
| def train(model, trn_data, dev_data, n_epochs=5, batch_size=24):
|
||||
| def minibatch(data):
|
||||
| random.shuffle(data)
|
||||
| for i in range(0, len(data), batch_size):
|
||||
| yield data[i:i+batch_size]
|
||||
|
|
||||
| for epoch in range(n_iter):
|
||||
| train_loss = 0.0
|
||||
| for batch in minibatch(trn_data):
|
||||
| train_loss += model.train(batch)
|
||||
| accuracy = sum(model.predict(x) == y for x, y in dev_data)
|
||||
| report_and_save(epoch, train_loss, accuracy, model)
|
||||
|
|
||||
|
|
||||
| class Extractor(object):
|
||||
| def __init__(self, nlp, vector_length, dropout_rate=0.3):
|
||||
| self.nlp = nlp
|
||||
| self.dropout_rate = dropout_rate
|
||||
| self.vector = numpy.zeros((vector_length, ))
|
||||
|
|
||||
| def doc2bow(self, doc, dropout_rate=0.3):
|
||||
| bow = defaultdict(int)
|
||||
| for word in doc:
|
||||
| if keep[word.i] >= dropout_rate and not word.is_punct:
|
||||
| bow[word.orth] += 1
|
||||
| return bow
|
||||
|
|
||||
| def bow2vec(self, bow):
|
||||
| self.vector.fill(0)
|
||||
| n = 0
|
||||
| for orth_id, freq in bow.items():
|
||||
| self.vector += self.nlp.vocab[orth].repvec * freq
|
||||
| n += freq
|
||||
| return self.vector / n
|
||||
|
|
||||
|
|
||||
| class NeuralNetwork(object):
|
||||
| def __init__(self, depth, width, n_classes, doc2bow, bow2vec, optimizer):
|
||||
| self.depth = depth
|
||||
| self.width = width
|
||||
| self.n_classes = n_classes
|
||||
| self.weights = Params.random(depth, width, n_classes)
|
||||
| self.doc2bow = doc2bow
|
||||
| self.bow2vec = bow2vec
|
||||
| self.optimizer = optimizer
|
||||
| self._gradient = Params.zero(depth, width, n_classes)
|
||||
| self._activity = numpy.zeros((self.depth, dimensions['hidden']))
|
||||
|
|
||||
| def train(self, batch):
|
||||
| activity = self._activity
|
||||
| gradient = self._gradient
|
||||
| activity.fill(0)
|
||||
| gradient.fill(0)
|
||||
| loss = 0
|
||||
| for doc, label in batch:
|
||||
| word_ids = self.doc2bow(doc)
|
||||
| vector = self.bow2vec(word_ids, fine_tuning=self.params.E)
|
||||
| self.forward(activity, vector)
|
||||
| loss += self.backprop(gradient, activity, word_ids, label)
|
||||
| self.optimizer(self.weights, self.gradient, word_freqs)
|
||||
|
|
||||
| def forward(self, actv, in_):
|
||||
| W = self.weights.W; b = self.weights.b
|
||||
| actv[0] = relu(in_, W[0], b[0])
|
||||
| for i in range(1, self.depth):
|
||||
| actv[i] = relu(actv[i-1], W[i], b[i])
|
||||
|
|
||||
| def backprop(self, gradient, actvity, ids, label):
|
||||
| W = self.weights.W; b = self.weights.b
|
||||
| target = zeros(self.n_classes)
|
||||
| target[label] = 1.0
|
||||
| pred = softmax(activty[-1], W[-1], b[-1])
|
||||
| delta = pred - target
|
||||
|
|
||||
| for i in range(self.depth, 0, -1):
|
||||
| gradient.b[i] += delta
|
||||
| gradient.W[i] += outer(delta, activity[i-1])
|
||||
| delta = d_relu(activity[i-1]) * W[i].T.dot(delta)
|
||||
|
|
||||
| gradient.b[0] += delta
|
||||
| gradient.W[0] += outer(delta, input_vector)
|
||||
| tuning = W[0].T.dot(D).reshape((self.width,)) / len(ids)
|
||||
| for w in ids:
|
||||
| if w < self.n_vocab:
|
||||
| gradient.E[w] += tuning
|
||||
|
|
||||
|
|
||||
| def softmax(actvn, W, b):
|
||||
| w = W.dot(actvn) + b
|
||||
| ew = exp(w - max(w))
|
||||
| return (ew / sum(ew)).ravel()
|
||||
|
|
||||
|
|
||||
| def relu(actvn, W, b):
|
||||
| x = W.dot(actvn) + b
|
||||
| return x * (x > 0)
|
||||
|
|
||||
|
|
||||
| def d_relu(x):
|
||||
| return x > 0
|
||||
|
|
||||
|
|
||||
| class Adagrad(object):
|
||||
| def __init__(self, dim, lr):
|
||||
| self.dim = dim
|
||||
| self.eps = 1e-3
|
||||
| # initial learning rate
|
||||
| self.learning_rate = lr
|
||||
| # stores sum of squared gradients
|
||||
| self.h = zeros(self.dim)
|
||||
| self._curr_rate = zeros(self.h.shape)
|
||||
|
|
||||
| def __call__(self, weights, gradient, batch_size, word_freqs):
|
||||
| update = self.rescale(gradient.data / batch_size)
|
||||
| weights.data -= update
|
||||
|
|
||||
| def rescale(self, gradient):
|
||||
| self._curr_rate.fill(0)
|
||||
| self.h += gradient ** 2
|
||||
| self._curr_rate = self.learning_rate / (sqrt(self.h) + self.eps)
|
||||
| return self._curr_rate * gradient
|
||||
|
|
||||
| def L2_penalty(self, gradient, weights, word_freqs):
|
||||
| # L2 Regularization
|
||||
| for i in range(weights.depth):
|
||||
| gradient.W[i] += weights.W[i] * self.rho
|
||||
| gradient.b[i] += weights.b[i] * self.rho
|
||||
| for w, freq in word_freqs.items():
|
||||
| gradient.E[w] += (weights.E[w] * freq) * self.rho
|
||||
|
|
||||
|
|
||||
| class Params(object):
|
||||
| @classmethod
|
||||
| def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab):
|
||||
| return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: zeros((x,)))
|
||||
|
|
||||
| @classmethod
|
||||
| def random(cls, depth, nE, nH, nL, nV):
|
||||
| return cls(depth, nE, nH, nL, nV, lambda x: (random.rand(x) * 2 - 1) * 0.08)
|
||||
|
|
||||
| @classmethod
|
||||
| def identity(cls, depth, nE, nH, nL, nV):
|
||||
| params = []
|
||||
| params.append(identity(nH))
|
||||
| params.append(zeros((nH, )))
|
||||
| for i in range(1, depth):
|
||||
| params.append(identity(nH))
|
||||
| params.append(zeros((nH, )))
|
||||
| params.append(zeros((nH, nL)))
|
||||
| params.append(zeros((nL, )))
|
||||
| params.append(zeros((nV, nE)))
|
||||
| return concatenate([p.ravel() for p in params])
|
||||
|
|
||||
| def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer):
|
||||
| nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab
|
||||
| n_weights = sum([
|
||||
| (nE * nH) + nH,
|
||||
| (nH * nH + nH) * depth,
|
||||
| (nH * nL) + nL,
|
||||
| (nV * nE)
|
||||
| ])
|
||||
| self.data = initializer(n_weights)
|
||||
| self.W = []
|
||||
| self.b = []
|
||||
| i = self._add_layer(0, nE, nH)
|
||||
| for _ in range(1, depth):
|
||||
| i = self._add_layer(i, nH, nH)
|
||||
| i = self._add_layer(i, nL, nH)
|
||||
| self.E = self.data[i : i + (nV * nE)].reshape((nV, nE))
|
||||
| self.E.fill(0)
|
||||
|
|
||||
| def _add_layer(self, start, x, y):
|
||||
| end = start + (x * y)
|
||||
| self.W.append(self.data[start : end].reshape((x, y)))
|
||||
| self.b.append(self.data[end : end + x].reshape((x, )))
|
||||
| return end + x
|
||||
|
|
||||
|
|
||||
| def read_data(nlp, data_dir):
|
||||
| for subdir, label in (('pos', 1), ('neg', 0)):
|
||||
| for filename in (data_dir / subdir).iterdir():
|
||||
| text = filename.open().read()
|
||||
| doc = nlp(text)
|
||||
| yield doc, label
|
||||
|
|
||||
|
|
||||
| def partition(examples, split_size):
|
||||
| examples = list(examples)
|
||||
| random.shuffle(examples)
|
||||
| n_docs = len(examples)
|
||||
| split = int(n_docs * split_size)
|
||||
| return examples[:split], examples[split:]
|
||||
|
||||
|
||||
|
||||
|
0
website/src/jade/tutorials/index.jade
Normal file
0
website/src/jade/tutorials/index.jade
Normal file
51
website/src/jade/tutorials/load-new-word-vectors/index.jade
Normal file
51
website/src/jade/tutorials/load-new-word-vectors/index.jade
Normal file
|
@ -0,0 +1,51 @@
|
|||
include ./meta.jade
|
||||
include ../header.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
|
||||
p By default spaCy loads a #[code data/vocab/vec.bin] file, where the #[em data] directory is within the #[code spacy.en] module directory.
|
||||
|
||||
p You can customize the word vectors loaded by spaCy in three different ways. For the first two, you'll need to convert your vectors into spaCy's binary file format. The binary format is used because it's smaller and loads faster.
|
||||
|
||||
p You can either place the binary file in the location spaCy expects
|
||||
|
||||
pre
|
||||
code.language-python
|
||||
| from spacy.vocab import write_binary_vectors
|
||||
| import spacy.en
|
||||
| from os import path
|
||||
|
|
||||
| def main(bz2_loc, bin_loc=None):
|
||||
| if bin_loc is None:
|
||||
| bin_loc = path.join(path.dirname(spacy.en.__file__), 'data', 'vocab', 'vec.bin')
|
||||
| write_binary_vectors(bz2_loc, bin_loc)
|
||||
|
|
||||
| if __name__ == '__main__':
|
||||
| plac.call(main)
|
||||
|
||||
|
||||
|
||||
ol
|
||||
li Replace the vec.bin, so your vectors will be loaded by default. The function #[code spacy.vocab.write_binary_vectors] is provided to convert files to spaCy's binary format. The advantage of the binary format is that it's smaller and loads faster.
|
||||
|
||||
li Load vectors at run-time
|
||||
|
||||
|
||||
Create the vec.bin file from a bz2 file using spacy.vocab.write_binary_vectors
|
||||
Either replace spaCy's vec.bin file, or call nlp.vocab.load_rep_vectors at run-time, with the path to the binary file.
|
||||
The above is a bit inconvenient at first, but the binary file format is much smaller and faster to load, and the vectors files are fairly big. Note that GloVe distributes in gzip format, not bzip.
|
||||
|
||||
Out of interest: are you using the GloVe vectors, or something you trained on your own data? If your own data, did you use Gensim? I'd like to make this much easier, so I'd appreciate suggestions for what work-flow you'd like to see.
|
||||
Load new vectors at run-time, optionally converting them
|
||||
|
||||
pre
|
||||
code.language-python
|
||||
| import spacy.vocab
|
||||
|
||||
| def set_spacy_vectors(nlp, binary_loc, bz2_loc=None):
|
||||
| if bz2_loc is not None:
|
||||
| spacy.vocab.write_binary_vectors(bz2_loc, binary_loc)
|
||||
| write_binary_vectors(bz2_input_loc, binary_loc)
|
||||
|
|
||||
| nlp.vocab.load_rep_vectors(binary_loc)
|
||||
|
132
website/src/jade/tutorials/mark-adverbs/index.jade
Normal file
132
website/src/jade/tutorials/mark-adverbs/index.jade
Normal file
|
@ -0,0 +1,132 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
p Let's say you're developing a proofreading tool, or possibly an IDE for writers. You're convinced by Stephen King's advice that #[a(href="http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs") adverbs are not your friend] so you want to #[strong highlight all adverbs]. We'll use one of the examples he finds particularly egregious:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV
|
||||
| >>> # Load the pipeline, and call it with some text.
|
||||
| >>> nlp = spacy.en.English()
|
||||
| >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False)
|
||||
| >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
|
||||
| u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
p Easy enough --- but the problem is that we've also highlighted "back". While "back" is undoubtedly an adverb, we probably don't want to highlight it. If what we're trying to do is flag dubious stylistic choices, we'll need to refine our logic. It turns out only a certain type of adverb is of interest to us.
|
||||
|
||||
p There are lots of ways we might do this, depending on just what words we want to flag. The simplest way to exclude adverbs like "back" and "not" is by word frequency: these words are much more common than the prototypical manner adverbs that the style guides are worried about.
|
||||
|
||||
p The #[code Lexeme.prob] and #[code Token.prob] attributes give a log probability estimate of the word:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> nlp.vocab[u'back'].prob
|
||||
| -7.403977394104004
|
||||
| >>> nlp.vocab[u'not'].prob
|
||||
| -5.407193660736084
|
||||
| >>> nlp.vocab[u'quietly'].prob
|
||||
| -11.07155704498291
|
||||
|
||||
p (The probability estimate is based on counts from a 3 billion word corpus, smoothed using the Simple Good-Turing method.)
|
||||
|
||||
p So we can easily exclude the N most frequent words in English from our adverb marker. Let's try N=1000 for now:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV
|
||||
| >>> nlp = spacy.en.English()
|
||||
| >>> # Find log probability of Nth most frequent word
|
||||
| >>> probs = [lex.prob for lex in nlp.vocab]
|
||||
| >>> probs.sort()
|
||||
| >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
|
||||
| >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
|
||||
| >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
|
||||
| ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
p There are lots of other ways we could refine the logic, depending on just what words we want to flag. Let's say we wanted to only flag adverbs that modified words similar to "pleaded". This is easy to do, as spaCy loads a vector-space representation for every word (by default, the vectors produced by Levy and Goldberg (2014)). Naturally, the vector is provided as a numpy array:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> pleaded = tokens[7]
|
||||
| >>> pleaded.vector.shape
|
||||
| (300,)
|
||||
| >>> pleaded.vector[:5]
|
||||
| array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32)
|
||||
|
||||
p We want to sort the words in our vocabulary by their similarity to "pleaded". There are lots of ways to measure the similarity of two vectors. We'll use the cosine metric:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> from numpy import dot
|
||||
| >>> from numpy.linalg import norm
|
||||
|
||||
| >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
| >>> words = [w for w in nlp.vocab if w.has_vector]
|
||||
| >>> words.sort(key=lambda w: cosine(w.vector, pleaded.vector))
|
||||
| >>> words.reverse()
|
||||
| >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
|
||||
| 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading
|
||||
| >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
|
||||
| 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses
|
||||
| >>> print('100-110', ', '.join(w.orth_ for w in words[100:110]))
|
||||
| 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes
|
||||
| >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
|
||||
| 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged
|
||||
| >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010]))
|
||||
| 50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists
|
||||
|
||||
p As you can see, the similarity model that these vectors give us is excellent — we're still getting meaningful results at 1000 words, off a single prototype! The only problem is that the list really contains two clusters of words: one associated with the legal meaning of "pleaded", and one for the more general sense. Sorting out these clusters is an area of active research.
|
||||
|
||||
p A simple work-around is to average the vectors of several words, and use that as our target:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested']
|
||||
| >>> say_vector = sum(nlp.vocab[verb].vector for verb in say_verbs) / len(say_verbs)
|
||||
| >>> words.sort(key=lambda w: cosine(w.vector, say_vector))
|
||||
| >>> words.reverse()
|
||||
| >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
|
||||
| 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired
|
||||
| >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
|
||||
| 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed
|
||||
| >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
|
||||
| 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate
|
||||
|
||||
p These definitely look like words that King might scold a writer for attaching adverbs to. Recall that our original adverb highlighting function looked like this:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV
|
||||
| >>> # Load the pipeline, and call it with some text.
|
||||
| >>> nlp = spacy.en.English()
|
||||
| >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
|
||||
| tag=True, parse=False)
|
||||
| >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens))
|
||||
| ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
|
||||
p We wanted to refine the logic so that only adverbs modifying evocative verbs of communication, like "pleaded", were highlighted. We've now built a vector that represents that type of word, so now we can highlight adverbs based on subtle logic, honing in on adverbs that seem the most stylistically problematic, given our starting assumptions:
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| >>> import numpy
|
||||
| >>> from numpy import dot
|
||||
| >>> from numpy.linalg import norm
|
||||
| >>> import spacy.en
|
||||
| >>> from spacy.parts_of_speech import ADV, VERB
|
||||
| >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
| >>> def is_bad_adverb(token, target_verb, tol):
|
||||
| ... if token.pos != ADV:
|
||||
| ... return False
|
||||
| ... elif token.head.pos != VERB:
|
||||
| ... return False
|
||||
| ... elif cosine(token.head.vector, target_verb) < tol:
|
||||
| ... return False
|
||||
| ... else:
|
||||
| ... return True
|
||||
|
||||
p This example was somewhat contrived — and, truth be told, I've never really bought the idea that adverbs were a grave stylistic sin. But hopefully it got the message across: the state-of-the-art NLP technologies are very powerful. spaCy gives you easy and efficient access to them, which lets you build all sorts of useful products and features that were previously impossible.
|
8
website/src/jade/tutorials/mark-adverbs/meta.jade
Normal file
8
website/src/jade/tutorials/mark-adverbs/meta.jade
Normal file
|
@ -0,0 +1,8 @@
|
|||
- var Meta = {}
|
||||
- Meta.headline = "Tutorial: Mark all adverbs, particularly for verbs of speech"
|
||||
- Meta.author_id = 'matt'
|
||||
- Meta.description = "Let's say you're developing a proofreading tool, or possibly an IDE for writers. You're convinced by Stephen King's advice that adverbs are not your friend so you want to highlight all adverbs."
|
||||
- Meta.date = "2015-08-18"
|
||||
- Meta.url = "/tutorials/mark-adverbs"
|
||||
- Meta.active = { "blog": true }
|
||||
- Meta.links = []
|
29
website/src/jade/tutorials/multilingual.jade
Normal file
29
website/src/jade/tutorials/multilingual.jade
Normal file
|
@ -0,0 +1,29 @@
|
|||
include ./meta.jade
|
||||
include ../header.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
h3 Overview
|
||||
|
||||
|
||||
p Each language requires the following definition files for the tokenizer, morphological analyzer and lemmatizer:
|
||||
|
||||
ol
|
||||
li Adapt the punctuation rules if necessary. Punctuation rules are defined in separate #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/prefix.txt") prefix], #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/suffix.txt") suffix] and #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/infix.txt") infix] files. Most languages will not require many changes for these files.
|
||||
li Specify tokenizer special-cases. A lookup-table is used to handle contractions and fused tokens. The lookup table is specified in a json file. Each entry is keyed by the string. Its value is a list of token specifications, including the form, lemma, part-of-speech, and morphological features.
|
||||
li Write lemmatization rules, and a list of exceptions. The English lemmatization rules can be seen #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/lemma_rules.json") here], used by the lemmatizer #[a(href="https://github.com/honnibal/spaCy/blob/master/spacy/lemmatizer.py") here].
|
||||
li Write a #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/tag_map.json") tag_map.json] file, which maps the language-specific treebank tag scheme to the #[a(href="http://universaldependencies.github.io/docs/") universal part-of-speech scheme], with additional morphological features.
|
||||
li Write a #[a(href="https://github.com/honnibal/spaCy/blob/master/lang_data/en/morphs.json") morphs.json] file, which lists special-cases for the morphological analyzer and lemmatizer. This file is keyed by the part-of-speech tag, and then by a list of orthographic forms. This allows words which require different morphological features, depending on their part-of-speech tag, to receive them.
|
||||
|
||||
h3 Tokenization algorithm
|
||||
|
||||
h3 Producing the Brown clusters
|
||||
|
||||
p See #[a(href="https://github.com/percyliang/brown-cluster") here].
|
||||
|
||||
h3 Producing word frequencies
|
||||
|
||||
p See #[a(href="https://github.com/honnibal/spaCy/blob/master/bin/get_freqs.py") here].
|
||||
|
||||
h3 Train tagger, dependency parser and named entity recognizer
|
||||
|
||||
p These require annotated data, which we typically must license.
|
19
website/src/jade/tutorials/set-lexeme-attrs.jade
Normal file
19
website/src/jade/tutorials/set-lexeme-attrs.jade
Normal file
|
@ -0,0 +1,19 @@
|
|||
include ./meta.jade
|
||||
include ../headers.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
h3 Job sizes grow faster than CPUs improve (so write fast code)
|
||||
|
||||
p The common wisdom in software engineering is that computers are fast and getting faster, so even if your job is compute-bound today, it may well be network or disk-bound tomorrow. So if you spend time optimizing performance, you're creating a depreciating asset: after time passes, it won't matter.
|
||||
|
||||
p This is often not true for natural language processing. The total compute-hours needed to process the entire web is growing, not shrinking. And we often do want to process the entire web, to answer a very specific question, to find opinions about a niche product or a local service, to forecast a trend as early as possible, to collect data for minority languages, etc.
|
||||
|
||||
p Lots of interesting NLP applications are needle-in-a-haystack problems. The haystack is getting bigger, so we can't sit around and wait for Moore's law to make everything easy. We have to write fast code.
|
||||
|
||||
h3 Caching can be unreasonably effective (so manage your memory)
|
||||
|
||||
p Zipf's law is probably the most important thing to learn in computational linguistics. We're otherwise pretty short of general truths, and this one's non-obvious and very useful.
|
||||
|
||||
h3 Sparse data structures are often essential (so your code may not look like the math)
|
||||
|
||||
h3 Exact inference is mostly irrelevant (so just use beam-search)
|
91
website/src/jade/tutorials/syntax-search/index.jade
Normal file
91
website/src/jade/tutorials/syntax-search/index.jade
Normal file
|
@ -0,0 +1,91 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
+WritePost(Meta)
|
||||
section.intro
|
||||
p Example use of the #[a(href="/docs") spaCy NLP tools] for data exploration. Here we will look for Reddit comments that describe Google doing something, i.e. discuss the company's actions. This is difficult, because other senses of "Google" now dominate usage of the word in conversation, particularly references to using Google products.
|
||||
|
||||
p The heuristics used are quick and dirty – about 5 minutes work.
|
||||
|
||||
details(open)
|
||||
summary: h4 Imports
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from __future__ import unicode_literals
|
||||
| from __future__ import print_function
|
||||
| import sys
|
||||
|
|
||||
| import plac
|
||||
| import bz2
|
||||
| import ujson
|
||||
| import spacy.en
|
||||
|
||||
details(open)
|
||||
summary: h4 Load the model and iterate over the data
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def main(input_loc):
|
||||
| nlp = spacy.en.English() # Loading the model takes 10-20 seconds.
|
||||
| for line in bz2.BZ2File(input_loc): # Iterate over the Reddit comments from the dump.
|
||||
| comment_str = ujson.loads(line)['body'] # Parse the json object, and extract the 'body' attribute.
|
||||
|
||||
details(open)
|
||||
summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| comment_parse = nlp(comment_str)
|
||||
| for word in comment_parse:
|
||||
| if google_doing_something(word):
|
||||
| # Print the clause
|
||||
| print(''.join(w.string for w in word.head.subtree).strip())
|
||||
|
||||
details(open)
|
||||
summary: h4 Define the filter function
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| def google_doing_something(w):
|
||||
| if w.lower_ != 'google':
|
||||
| return False
|
||||
| # Is it the subject of a verb?
|
||||
| elif w.dep_ != 'nsubj':
|
||||
| return False
|
||||
| # And not 'is'
|
||||
| elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux':
|
||||
| return False
|
||||
| # Exclude e.g. "Google says..."
|
||||
| elif w.head.lemma_ in ('say', 'show'):
|
||||
| return False
|
||||
| else:
|
||||
| return True
|
||||
|
||||
details(open)
|
||||
summary: h4 Call main
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| if __name__ == '__main__':
|
||||
| plac.call(main)
|
||||
|
||||
details(open)
|
||||
summary: h4 Example output
|
||||
|
||||
p Many false positives remain. Some are from incorrect interpretations of the sentence by spaCy, some are flaws in our filtering logic. But the results are vastly better than a string-based search, which returns almost no examples of the pattern we're looking for.
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| Google dropped support for Android < 4.0 already
|
||||
| google drive
|
||||
| Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc
|
||||
| When Google responds
|
||||
| Google translate cyka pasterino.
|
||||
| A quick google looks like Synology does have a sync'ing feature which does support block level so that should work
|
||||
| (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible?
|
||||
| Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop.
|
||||
| Google offers something like this already, but it is truly terrible.
|
||||
| google isn't helping me
|
||||
| Google tells me: 0 results, 250 pages removed from google.
|
||||
| how did Google swoop in and eat our lunch
|
8
website/src/jade/tutorials/syntax-search/meta.jade
Normal file
8
website/src/jade/tutorials/syntax-search/meta.jade
Normal file
|
@ -0,0 +1,8 @@
|
|||
- var Meta = {}
|
||||
- Meta.headline = "Tutorial: Search Reddit for comments about Google doing something"
|
||||
- Meta.description = "Example use of the spaCy NLP tools for data exploration. Here we will look for Reddit comments that describe Google doing something, i.e. discuss the company's actions. This is difficult, because other senses of \"Google\" now dominate usage of the word in conversation, particularly references to using Google products."
|
||||
- Meta.author_id = "matt"
|
||||
- Meta.date = "2015-08-18"
|
||||
- Meta.url = "/tutorials/syntax-search"
|
||||
- Meta.active = { "blog": true }
|
||||
- Meta.links = []
|
167
website/src/jade/tutorials/twitter-filter/index.jade
Normal file
167
website/src/jade/tutorials/twitter-filter/index.jade
Normal file
|
@ -0,0 +1,167 @@
|
|||
include ../../header.jade
|
||||
include ./meta.jade
|
||||
|
||||
|
||||
+WritePost(Meta)
|
||||
section.intro
|
||||
p #[a(href="http://spaCy.io") spaCy] is great for data exploration. Poking, prodding and sifting is fundamental to good data science. In this tutorial, we'll do a broad keyword search of Twitter, and then sift through the live stream of tweets, zooming in on some topics and excluding others.
|
||||
|
||||
p An example filter-function:
|
||||
|
||||
pre.language-python: code
|
||||
| accept = map(get_vector, 'Jeb Cheney Republican 9/11h'.split())
|
||||
| reject = map(get_vector, 'garden Reggie hairy'.split())
|
||||
|
|
||||
| y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
|
||||
| n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)
|
||||
|
|
||||
| if (y / (y + n)) >= 0.5 or True:
|
||||
| print(text)
|
||||
|
||||
p We'll build the filter as we go, simply editing the script in place. However, we don't want to disconnect and reconnect from Twitter on every change. To do this, we'll just put our match function in a different file, and reload the module every few seconds. It's probably not a UI you'd deliver to someone else, but so what? Data exploration is between you and your terminal.
|
||||
|
||||
details
|
||||
summary: h4 twitter_filter.py (complete script)
|
||||
|
||||
pre.language-python: code
|
||||
| # encoding: utf8
|
||||
| from __future__ import unicode_literals, print_function
|
||||
| import plac
|
||||
| import codecs
|
||||
| import pathlib
|
||||
| import random
|
||||
|
|
||||
| import twython
|
||||
| import spacy.en
|
||||
|
|
||||
| import _handler
|
||||
|
|
||||
|
|
||||
| class Connection(twython.TwythonStreamer):
|
||||
| def __init__(self, keys_dir, nlp, query):
|
||||
| keys_dir = pathlib.Path(keys_dir)
|
||||
| read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
|
||||
| api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
|
||||
| twython.TwythonStreamer.__init__(self, *api_key)
|
||||
| self.nlp = nlp
|
||||
| self.query = query
|
||||
|
|
||||
| def on_success(self, data):
|
||||
| _handler.handle_tweet(self.nlp, data, self.query)
|
||||
| if random.random() < 0.1:
|
||||
| reload(_handler)
|
||||
|
|
||||
|
|
||||
| def main(keys_dir, term):
|
||||
| nlp = spacy.en.English()
|
||||
| twitter = Connection(keys_dir, nlp, term)
|
||||
| twitter.statuses.filter(track=term, language='en')
|
||||
|
|
||||
|
|
||||
| if __name__ == '__main__':
|
||||
| plac.call(main)
|
||||
|
||||
details
|
||||
summary: h4 _handler.py (complete script)
|
||||
|
||||
pre.language-python
|
||||
code
|
||||
| from __future__ import unicode_literals, print_function
|
||||
|
|
||||
| from math import sqrt
|
||||
| from numpy import dot
|
||||
| from numpy.linalg import norm
|
||||
|
|
||||
|
|
||||
| def handle_tweet(spacy, tweet_data, query):
|
||||
| text = tweet_data.get('text', '')
|
||||
| match_tweet(spacy, text.decode('utf8'), query)
|
||||
|
|
||||
|
|
||||
| def match_tweet(spacy, text, query):
|
||||
| def get_vector(word):
|
||||
| return spacy.vocab[word].repvec
|
||||
|
|
||||
| tweet = spacy(text)
|
||||
| tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query]
|
||||
| if tweet:
|
||||
| accept = map(get_vector, 'Jeb Cheney Republican 9/11h'.split())
|
||||
| reject = map(get_vector, 'garden Reggie hairy'.split())
|
||||
| y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
|
||||
| n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)
|
||||
|
|
||||
| if (y / (y + n)) >= 0.5 or True:
|
||||
| print(text)
|
||||
|
|
||||
|
|
||||
| def cos(v1, v2):
|
||||
| return dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
|
||||
p Below, I go through the script in execution-order.
|
||||
|
||||
details(open=true)
|
||||
summary
|
||||
h4 Command-line interface and main()
|
||||
|
||||
p I always use the plac library for command-line arguments, as I think it "scales down" the best. This just wraps the main function, so that the two arguments of main are positional arguments on the command-line.
|
||||
|
||||
p We accept a directory that stores the twitter keys, and a search term, which we live-filter twitter for. We build a spaCy instance, and pass it to the connection. From here, #[code Connection.on_success] is called with each tweet – so that's where the rest of our logic lives.
|
||||
|
||||
pre.language-python: code
|
||||
| def main(keys_dir, term):
|
||||
| nlp = spacy.en.English()
|
||||
| twitter = Connection(keys_dir, nlp, term)
|
||||
| twitter.statuses.filter(track=term, language='en')
|
||||
|
|
||||
|
|
||||
| if __name__ == '__main__':
|
||||
| plac.call(main)
|
||||
|
||||
details(open=true)
|
||||
summary: h4 The event loop
|
||||
|
||||
p We read our key files off disk, and connect to Twitter. In the #[code on_success()] method, which is called with each tweet, we reload the handler 10% of the time. This way we can live-edit the handler, without constantly reconnecting to Twitter.
|
||||
|
||||
pre.language-python: code
|
||||
| class Connection(twython.TwythonStreamer):
|
||||
| def __init__(self, keys_dir, nlp, query):
|
||||
| keys_dir = pathlib.Path(keys_dir)
|
||||
| read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
|
||||
| api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
|
||||
| twython.TwythonStreamer.__init__(self, *api_key)
|
||||
| self.nlp = nlp
|
||||
| self.query = query
|
||||
|
|
||||
| def on_success(self, data):
|
||||
| _handler.handle_tweet(self.nlp, data, self.query)
|
||||
| if random.random() < 0.1:
|
||||
| reload(_handler)
|
||||
|
||||
|
||||
details(open=true)
|
||||
summary: h4 The part you live edit
|
||||
|
||||
p We can now use spaCy on the live Twitter feed. In the example below I sketch out a simple word-vector based tweet filter. The idea here is to be a bit more flexible than simple keyword filters, as we can turn the strictness of the filter up or down with the constant given, and we can specify exclusion terms to filter out distractor topics as they arise.
|
||||
|
||||
p Instead of averaging the vectors in the tweet, I've set it to sum the values above zero, taking a little inspiration from what works well in neural networks. This is just an idea, to show that we can really come up with arbitrary logic quite quickly here.
|
||||
pre.language-python: code
|
||||
| def handle_tweet(spacy, resp, query):
|
||||
| def get_vector(word):
|
||||
| return spacy.vocab[word].repvec
|
||||
|
|
||||
| text = resp.get('text', '').decode('utf8')
|
||||
| tweet = spacy(text)
|
||||
| tweet = [w.repvec for w in tweet if w.is_alpha and w.lower_ != query]
|
||||
| if tweet:
|
||||
| accept = map(get_vector, 'Jeb Cheney Republican 9/11h'.split())
|
||||
| reject = map(get_vector, 'garden Reggie hairy'.split())
|
||||
|
|
||||
| y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
|
||||
| n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)
|
||||
|
|
||||
| if (y / (y + n)) >= 0.5 or True:
|
||||
| print(text)
|
||||
|
|
||||
|
|
||||
| def cos(v1, v2):
|
||||
| return dot(v1, v2) / (norm(v1) * norm(v2))
|
8
website/src/jade/tutorials/twitter-filter/meta.jade
Normal file
8
website/src/jade/tutorials/twitter-filter/meta.jade
Normal file
|
@ -0,0 +1,8 @@
|
|||
- var Meta = {}
|
||||
- Meta.headline = "Tutorial: Finding Relevant Tweets"
|
||||
- Meta.author_id = 'matt'
|
||||
- Meta.description = "In this tutorial, we will use word vectors to search for tweets about Jeb Bush. We'll do this by building up two word lists: one that represents the type of meanings in the Jeb Bush tweets, and another to help screen out irrelevant tweets that mention the common, ambiguous word 'bush'."
|
||||
- Meta.date = "2015-08-18"
|
||||
- Meta.url = "/tutorials/twitter-filter"
|
||||
- Meta.active = { "blog": true }
|
||||
- Meta.links = []
|
246
website/src/js/displacy/displacy.js
Normal file
246
website/src/js/displacy/displacy.js
Normal file
|
@ -0,0 +1,246 @@
|
|||
// Element that every displaCy visualisation is rendered into.
var container = document.getElementById("displacy");

// Namespace for the displaCy helper functions. It is only ever used as a
// bag of named functions, so it is a plain object rather than an array.
var dp = {};
|
||||
|
||||
/**
 * POST a text to the displaCy API and render the parse that comes back.
 *
 * @param {string} mode   "manual" seeds `call` with the query followed by "/";
 *                        "steps" resets `call` to 0 before the response is
 *                        rendered. Any other value leaves `call` untouched.
 * @param {string} api    URL the request is POSTed to.
 * @param {string} query  Text to be parsed.
 * @param {*} [call]      When truthy it is sent to the API instead of `query`.
 */
var displaCy = function(mode, api, query, call) {
    if(mode === "manual" && !call) call = query + "/";
    // The request payload is fixed before `call` is reset for steps mode.
    var request = call || query;
    if(mode === "steps") call = 0;

    // Pass `true` explicitly: a missing argument is falsy and would hide the
    // spinner immediately.
    dp.loadingIndicator(true);

    var xhr = new XMLHttpRequest();
    xhr.open("POST", api, true);
    xhr.setRequestHeader("Content-type", "text/plain");
    xhr.onreadystatechange = function() {
        // 4 === DONE; earlier states carry no usable response yet.
        if(xhr.readyState !== 4) return;

        try {
            if(xhr.status === 200) {
                dp.setDisplay(mode, api, query, call, JSON.parse(xhr.responseText));
            }
        }
        finally {
            // Hide the spinner on success, on HTTP errors and when parsing or
            // rendering throws; the error itself still propagates.
            dp.loadingIndicator(false);
        }
    };
    xhr.send(JSON.stringify({ text: request }));
};
|
||||
|
||||
/**
 * Render one parser state of an API result into the container.
 *
 * Reads `result.words`, `result.actions` and, from the selected entry of
 * `result.states`, its `arrows`, `stack`, `focus` and `is_final` fields.
 *
 * @param {string} mode    Action buttons are added for "steps" and "manual" only.
 * @param {string} api     Forwarded to dp.addActions.
 * @param {string} query   Forwarded to dp.addActions.
 * @param {*} call         A number selects the state to show; anything else shows state 0.
 * @param {Object} result  Parsed API response.
 */
dp.setDisplay = function(mode, api, query, call, result) {
    // Clamp the requested index so stepping past either end shows the
    // nearest existing state instead of throwing on `undefined.arrows`.
    var last = result.states.length - 1;
    var state = (typeof call === "number") ? Math.min(Math.max(call, 0), last) : 0;
    var current = result.states[state];

    var wordlist = result.words;
    var arrowlist = current.arrows;
    var stacklist = current.stack;
    var focus = current.focus;

    // CSS class for each part-of-speech tag and for each word flag.
    var wordClasses = {
        "NO_TAG" : "w-notag", "ADJ" : "w-adj", "ADV" : "w-adv", "ADP" : "w-adp",
        "DET" : "w-det", "NOUN" : "w-noun", "PRON" : "w-pron", "PRT" : "w-prt",
        "VERB" : "w-verb", "X" : "w-x", "PUNCT" : "w-punct", "EOL" : "w-eol",
        "SPACE" : "w-space", "on_stack" : "stack", "is_entity" : "w-ent",
        "low_prob" : "w-low", "in_focus" : "in-focus"
    };

    container.scrollLeft = 0;
    dp.clearDisplay();
    dp.addCss(arrowlist, wordlist);
    dp.addArrows(arrowlist);
    dp.addWords(wordlist, wordClasses, focus, stacklist);
    dp.setFocus(focus, arrowlist, wordlist, stacklist);

    // Steps mode replays the cached result, so it gets the result object and
    // the clamped index; manual mode re-queries the API and needs neither.
    if(mode === "steps") dp.addActions(result.actions, current.is_final, mode, api, query, state, result);
    else if(mode === "manual") dp.addActions(result.actions, current.is_final, mode, api, query, call);
};
|
||||
|
||||
/** Remove everything currently rendered inside the #displacy element. */
dp.clearDisplay = function() {
    var display = document.getElementById("displacy");
    display.innerHTML = "";
};
|
||||
|
||||
/** Empty the #actions element, if one is present in the document. */
dp.clearActions = function() {
    var actions = document.getElementById("actions");
    // Nothing to clear when no action bar has been rendered yet.
    if(actions == null) return;
    actions.innerHTML = "";
};
|
||||
|
||||
/**
 * Show or hide the loading spinner.
 *
 * A single #spinner element is created on demand and reused afterwards, so
 * repeated calls never leave several elements with the same id behind.
 *
 * @param {boolean} [loading]  Truthy shows the spinner; falsy (or omitted) hides it.
 */
dp.loadingIndicator = function(loading) {
    var spinner = document.getElementById("spinner");

    if(!spinner) {
        spinner = dp.element("div", "spinner", "spinner", false);
        container.appendChild(spinner);
    }

    // An empty string drops the inline override so the stylesheet decides
    // how a visible spinner looks.
    spinner.style.visibility = loading ? "" : "hidden";
};
|
||||
|
||||
/**
 * Choose the drawing dimensions for a parse with the given number of arrow levels.
 *
 * @param {Array} arrowlist  Arrow levels of the current state; only its length is used.
 * @returns {{height: number, width: number, spacing: number, unit: string}}
 *          Numeric dimensions plus the CSS unit to append to them.
 */
dp.calcSize = function(arrowlist) {
    // Plain numbers throughout, so callers get the same types whether or not
    // one of the scaling rules below fires.
    var size = { height: 350, width: 175, spacing: 10, unit: "px" };
    var levels = arrowlist.length;

    // Shallow parses need far less vertical room.
    if(levels <= 3) size.height /= 2.75;

    // The two enlargements are cumulative: more than 20 levels gets both.
    if(levels > 12) {
        size.width *= 1.15;
        size.height *= 1.25;
    }
    if(levels > 20) {
        size.width *= 1.25;
        size.height *= 1.5;
    }

    return size;
};
|
||||
|
||||
/**
 * Build the stylesheet for the current parse and append it to the container.
 *
 * @param {Array} arrowlist  Arrow levels; one set of `.levelN` rules is emitted per level.
 * @param {Array} wordlist   Words of the sentence; determines arrow offsets and total width.
 */
dp.addCss = function(arrowlist, wordlist) {
    var size = dp.calcSize(arrowlist);
    var levels = arrowlist.length;

    var css = {
        height: size.height + size.unit,
        width: size.width + size.unit,
        spacing: size.spacing + size.unit
    };

    var stylesheet = dp.element("style", false, false, ["scoped", "true"]);

    // Static rules that do not depend on the parse.
    var styles = ["#displacy *,#displacy *:before,#displacy *:after{box-sizing:border-box}#displacy{position:relative;overflow:scroll}#displacy .focus{position:absolute;top:0;height:100%;z-index:-1;background:rgba(0,0,0,.25)}#displacy .current-stack{margin:6em 1.5em;font-size:.75em;opacity:.25}#displacy .actions{position:fixed;}#displacy .words{display:flex;display:-webkit-flex;display:-ms-flexbox;display:-webkit-box;flex-flow:row nowrap;overflow:hidden;text-align:center}#displacy .words .word:after{content:attr(title);display:block}#displacy .arrows{width:100%;position:relative}.level{position:absolute;bottom:0;width:100%}#displacy .arrow{height:100%;position:absolute;overflow:hidden}#displacy .arrow:before{content:attr(title);text-align:center;display:block;height:200%;border-radius:50%;border:2px solid;margin:0 auto}#displacy .arrow:after{content:'';width:0;height:0;position:absolute;bottom:-1px;border-top:12px solid;border-left:6px solid transparent;border-right:6px solid transparent}#displacy .arrow.null{display:none}"];

    // Per-level rules: higher levels are taller and span more words.
    for(var i = 1; i <= levels; i++) {
        var level = ".level" + i;
        var remaining = levels - i;

        styles.push("#displacy " + level + "{height:" + Math.floor(100/levels * i) + "%}#displacy " + level + " .arrow{width:calc(" + css.width + " * " + i + ")}#displacy " + level + " .arrow:before{width:calc(100% - " + css.spacing + " * " + remaining + " - 10px)}#displacy " + level + " .arrow.left:after{left:calc(" + css.spacing + " * " + remaining/2 + ")}#displacy " + level + " .arrow.right:after{right:calc(" + css.spacing + " * " + remaining/2 + ")}");
    }

    // Horizontal offset of the n-th arrow inside a level.
    for(var j = 1; j < wordlist.length; j++) {
        styles.push("#displacy .level .arrow:nth-child(" + j + "){left:calc(" + css.width + " * " + (j - 1) + ")}");
    }

    // These two rules do not depend on the word index, so they are emitted
    // exactly once, including for sentences with a single word.
    styles.push("#displacy .arrows{height:" + css.height + "}#displacy .level{left:calc(" + css.width + "/2)}");

    styles.push("#displacy .words{min-width:calc(" + css.width + " * " + wordlist.length + ")}.words .word{width:" + css.width + "}");

    stylesheet.appendChild(document.createTextNode(styles.join(' ')));
    container.appendChild(stylesheet);
};
|
||||
|
||||
/**
 * Render the dependency arrows, one `.level` row per entry of `arrowlist`.
 *
 * @param {Array} arrowlist  Array of levels; each level is an array whose
 *                           entries are either null (no arrow in that slot)
 *                           or an object providing `label` and `dir`.
 */
dp.addArrows = function(arrowlist) {
    var arrowContainer = dp.element("div", "arrows");

    for(var row = 0; row < arrowlist.length; row++) {
        var arcs = arrowlist[row];
        // Level class names are 1-based: level1, level2, ...
        var level = dp.element("div", "level level" + (row + 1));

        for(var col = 0; col < arcs.length; col++) {
            var arc = arcs[col];
            var arrow = dp.element("span");

            if(arc === null) {
                // Empty slots still get an element so nth-child positions line up.
                arrow.className = "arrow null";
            }
            else {
                arrow.setAttribute("title", arc.label);
                arrow.className = "arrow " + arc.dir;
            }

            level.appendChild(arrow);
        }

        arrowContainer.appendChild(level);
    }

    container.appendChild(arrowContainer);
};
|
||||
|
||||
/**
 * Render the words of the sentence and append them to the container.
 *
 * @param {Array} wordlist     Word objects; `word`, `tag`, `is_entity` and `prob` are read.
 * @param {Object} classnames  Maps tags and the flags "in_focus", "on_stack",
 *                             "is_entity" and "low_prob" to CSS class names.
 * @param {number} focus       Index of the word that receives the in-focus class.
 * @param {Array} stacklist    Truthy at index i when word i gets the on-stack class.
 */
dp.addWords = function(wordlist, classnames, focus, stacklist) {
    var wordContainer = dp.element("div", "words");

    // `var` keeps the counter local instead of leaking an implicit global.
    for(var i = 0; i < wordlist.length; i++) {
        var current = wordlist[i];
        var tag = current.tag;
        var classes = [ "word" ];

        var word = dp.element("div", false, false, ["title", tag]);

        // Tags without a mapping are skipped rather than adding the literal
        // class name "undefined".
        if(classnames[tag]) classes.push(classnames[tag]);

        if(i === focus) classes.push(classnames["in_focus"]);
        if(stacklist[i]) classes.push(classnames["on_stack"]);
        if(current.is_entity) classes.push(classnames["is_entity"]);
        // Non-entity words whose `prob` is at or below -17 get the low_prob class.
        if(!current.is_entity && current.prob <= -17) classes.push(classnames["low_prob"]);

        word.className = classes.join(" ");
        word.appendChild(dp.element("span", false, false, false, current.word));
        wordContainer.appendChild(word);
    }

    container.appendChild(wordContainer);
};
|
||||
|
||||
/**
 * Draw the focus column behind the focused word and scroll it into view.
 *
 * @param {number} focus     Index of the focused word.
 * @param {Array} arrowlist  Passed to dp.calcSize to obtain the column width.
 * @param {Array} wordlist   Passed to dp.compileStack.
 * @param {Array} stacklist  Passed to dp.compileStack.
 */
dp.setFocus = function(focus, arrowlist, wordlist, stacklist) {
    var size = dp.calcSize(arrowlist);
    // Left edge of the focused word's column.
    var offset = size.width * focus;

    var marker = dp.element("div", "focus", "focus");
    marker.style.width = size.width + size.unit;
    marker.style.left = offset + size.unit;
    marker.appendChild(dp.compileStack(wordlist, stacklist));
    container.appendChild(marker);

    // Once the column lies beyond the middle of the visible area, scroll so
    // that it ends up centred.
    var halfView = container.clientWidth/2;
    if(offset - container.scrollLeft > halfView) {
        container.scrollLeft = offset - halfView + size.width/2;
    }
};
|
||||
|
||||
/**
 * Build the element that lists the words currently on the stack.
 *
 * @param {Array} wordlist   Word objects; `word` is used as the label.
 * @param {Array} stacklist  Truthy at index i when word i is on the stack.
 * @returns {Element} A `.current-stack` div with one child div per stacked word.
 */
dp.compileStack = function(wordlist, stacklist) {
    var stack = dp.element("div", "current-stack", false, ["title", "Stack"]);

    // Index loop rather than for...in, so only real array elements are
    // visited, in order, regardless of enumerable properties on the array
    // or its prototype.
    for(var i = 0; i < wordlist.length; i++) {
        if(stacklist[i]) {
            stack.appendChild(dp.element("div", false, false, false, wordlist[i].word));
        }
    }

    return stack;
};
|
||||
|
||||
/**
 * Render one button per action and install the matching keyboard shortcuts.
 *
 * @param {Object|Array} actionlist  Actions; `label`, `key`, `binding` and `is_valid` are read.
 * @param {boolean} is_final         When truthy every button is disabled and no shortcut is bound.
 * @param {string} mode              Forwarded to dp.performAction.
 * @param {string} api               Forwarded to dp.performAction.
 * @param {string} query             Forwarded to dp.performAction.
 * @param {*} call                   Forwarded to dp.performAction.
 * @param {Object} [result]          Forwarded to dp.performAction.
 */
dp.addActions = function(actionlist, is_final, mode, api, query, call, result) {
    dp.clearActions();

    var bindings = [];
    var actionContainer = dp.element("div", "actions", "actions");

    for(var i in actionlist) {
        var entry = actionlist[i];
        // The label doubles as the button's class name and its text.
        var button = dp.element("button", entry.label, false, false, entry.label);
        button.onclick = dp.performAction(mode, api, query, call, entry.key, result);

        if(!entry.is_valid || is_final) {
            button.disabled = true;
        }
        else {
            // Only enabled actions get a keyboard shortcut.
            bindings.push({
                key: entry.key,
                code: entry.binding,
                action: button.onclick
            });
        }

        actionContainer.appendChild(button);
    }

    container.appendChild(actionContainer);

    // Replaces any previously installed document-level keydown handler.
    document.onkeydown = function(event) {
        // Leave typing inside <input> elements alone.
        if('input' == event.target.tagName.toLowerCase()) return;

        var codes = [];
        for(var k = 0; k < bindings.length; k++) {
            var binding = bindings[k];
            if(event.keyCode == binding.code) {
                binding.action();
            }
            codes.push(binding.code);
        }

        // Returning false suppresses the default action for bound keys.
        if(codes.indexOf(event.keyCode) != -1) return false;
    };

    if(is_final) container.scrollLeft = 0;
};
|
||||
|
||||
/**
 * Create the click / shortcut handler for a single action.
 *
 * @param {string} mode      "parse" or "manual": the handler appends the action
 *                           to `call` and re-queries the API. "steps": the
 *                           handler moves through the cached `result`.
 * @param {string} api       API URL, forwarded to displaCy / dp.setDisplay.
 * @param {string} query     Original query text, forwarded likewise.
 * @param {*} call           Action string so far (parse/manual) or current state index (steps).
 * @param {string} action    Key of the action; in steps mode "N" steps forward,
 *                           "P" steps back and anything else returns to state 0.
 * @param {Object} [result]  Cached API response, used in steps mode.
 * @returns {Function|undefined} The handler, or undefined for any other mode.
 */
dp.performAction = function(mode, api, query, call, action, result) {
    if(mode === "parse" || mode === "manual") {
        return function() {
            call += action + ",";
            displaCy(mode, api, query, call);
        };
    }

    if(mode === "steps") {
        return function() {
            // Index of the last available state; unbounded when it is unknown.
            var last = (result && result.states) ? result.states.length - 1 : Infinity;

            if(action === "N") {
                // Stay on the final state instead of stepping past it.
                if(call < last) call++;
            }
            else if(action === "P" && call > 0) call--;
            else call = 0;

            dp.setDisplay(mode, api, query, call, result);
        };
    }
};
|
||||
|
||||
/**
 * Create a DOM element in one call.
 *
 * @param {string} tag            Tag name of the element to create.
 * @param {string|false} [classname]  Class attribute; falsy yields an empty class.
 * @param {string|false} [id]         Id attribute; skipped when falsy.
 * @param {Array|false} [attribute]   `[name, value]` pair set as an attribute; skipped when falsy.
 * @param {string|false} [content]    Text appended as a text node; skipped when falsy.
 * @returns {Element} The newly created, still detached element.
 */
dp.element = function(tag, classname, id, attribute, content) {
    var node = document.createElement(tag);
    node.className = classname ? classname : "";

    if(id) {
        node.id = id;
    }
    if(attribute) {
        node.setAttribute(attribute[0], attribute[1]);
    }
    if(content) {
        var text = document.createTextNode(content);
        node.appendChild(text);
    }

    return node;
};
|
34
website/src/sass/_colors.sass
Normal file
34
website/src/sass/_colors.sass
Normal file
|
@ -0,0 +1,34 @@
|
|||
// Page
|
||||
$c-text: black
|
||||
$c-bg: white
|
||||
$c-medium: #a7aaa2
|
||||
$c-light: lighten($c-medium, 32.5%)
|
||||
$c-bad: #e80037
|
||||
$c-good: #00cc3a
|
||||
$c-highlight: #ffa400
|
||||
$c-lowlight: $c-medium
|
||||
|
||||
// Sections
|
||||
$c-page: #009acc
|
||||
$c-blog: #f25f5c
|
||||
|
||||
// Social
|
||||
$c-twitter: #5ea9dd
|
||||
$c-reddit: #ff4500
|
||||
$c-hn: #ff6600
|
||||
|
||||
// Prism
|
||||
$prism-bg: #272822
|
||||
$prism-text: #f8f8f2
|
||||
$prism-punct: #999999
|
||||
$prism-comment: slategray
|
||||
$prism-number: #ae81ff
|
||||
$prism-selector: #a6e22e
|
||||
$prism-operator: #f92672
|
||||
$prism-keyword: #66d9ef
|
||||
$prism-regex: #e6db74
|
||||
$prism-tag: #f92672
|
||||
$prism-value: #fd971f
|
||||
|
||||
//$prism-punct: #f8f8f2
|
||||
//$prism-operator: #f8f8f2
|
101
website/src/sass/_fonts.sass
Normal file
101
website/src/sass/_fonts.sass
Normal file
|
@ -0,0 +1,101 @@
|
|||
// Font import
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-regular.eot')
|
||||
src: url('../fonts/karla-regular.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-regular.woff2') format('woff2'), url('../fonts/karla-regular.woff') format('woff'), url('../fonts/karla-regular.ttf') format('truetype'), url('../fonts/karla-regular.svg#karlaregular') format('svg')
|
||||
font-weight: 400
|
||||
font-style: normal
|
||||
unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-regular.eot')
|
||||
src: url('../fonts/karla-regular.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-regular.woff2') format('woff2'), url('../fonts/karla-regular.woff') format('woff'), url('../fonts/karla-regular.ttf') format('truetype'), url('../fonts/karla-regular.svg#karlaregular') format('svg')
|
||||
font-weight: 400
|
||||
font-style: normal
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-italic.eot')
|
||||
src: url('../fonts/karla-italic.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-italic.woff2') format('woff2'), url('../fonts/karla-italic.woff') format('woff'), url('../fonts/karla-italic.ttf') format('truetype'), url('../fonts/karla-italic.svg#karlaitalic') format('svg')
|
||||
font-weight: 400
|
||||
font-style: italic
|
||||
unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-italic.eot')
|
||||
src: url('../fonts/karla-italic.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-italic.woff2') format('woff2'), url('../fonts/karla-italic.woff') format('woff'), url('../fonts/karla-italic.ttf') format('truetype'), url('../fonts/karla-italic.svg#karlaitalic') format('svg')
|
||||
font-weight: 400
|
||||
font-style: italic
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-bold.eot')
|
||||
src: url('../fonts/karla-bold.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-bold.woff2') format('woff2'), url('../fonts/karla-bold.woff') format('woff'), url('../fonts/karla-bold.ttf') format('truetype'), url('../fonts/karla-bold.svg#karlabold') format('svg')
|
||||
font-weight: 700
|
||||
font-style: normal
|
||||
unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-bold.eot')
|
||||
src: url('../fonts/karla-bold.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-bold.woff2') format('woff2'), url('../fonts/karla-bold.woff') format('woff'), url('../fonts/karla-bold.ttf') format('truetype'), url('../fonts/karla-bold.svg#karlabold') format('svg')
|
||||
font-weight: 700
|
||||
font-style: normal
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-bolditalic.eot')
|
||||
src: url('../fonts/karla-bolditalic.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-bolditalic.woff2') format('woff2'), url('../fonts/karla-bolditalic.woff') format('woff'), url('../fonts/karla-bolditalic.ttf') format('truetype'), url('../fonts/karla-bolditalic.svg#karlabolditalic') format('svg')
|
||||
font-weight: 700
|
||||
font-style: italic
|
||||
unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF
|
||||
|
||||
@font-face
|
||||
font-family: 'Karla'
|
||||
src: url('../fonts/karla-bolditalic.eot')
|
||||
src: url('../fonts/karla-bolditalic.eot?#iefix') format('embedded-opentype'), url('../fonts/karla-bolditalic.woff2') format('woff2'), url('../fonts/karla-bolditalic.woff') format('woff'), url('../fonts/karla-bolditalic.ttf') format('truetype'), url('../fonts/karla-bolditalic.svg#karlabolditalic') format('svg')
|
||||
font-weight: 700
|
||||
font-style: italic
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000
|
||||
|
||||
@font-face
|
||||
font-family: 'Inconsolata'
|
||||
src: url('../fonts/inconsolata-regular.eot')
|
||||
src: url('../fonts/inconsolata-regular.eot?#iefix') format('embedded-opentype'), url('../fonts/inconsolata-regular.woff2') format('woff2'), url('../fonts/inconsolata-regular.woff') format('woff'), url('../fonts/inconsolata-regular.ttf') format('truetype'), url('../fonts/inconsolata-regular.svg#inconsolataregular') format('svg')
|
||||
font-weight: 400
|
||||
font-style: normal
|
||||
unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF
|
||||
|
||||
@font-face
|
||||
font-family: 'Inconsolata'
|
||||
src: url('../fonts/inconsolata-regular.eot')
|
||||
src: url('../fonts/inconsolata-regular.eot?#iefix') format('embedded-opentype'), url('../fonts/inconsolata-regular.woff2') format('woff2'), url('../fonts/inconsolata-regular.woff') format('woff'), url('../fonts/inconsolata-regular.ttf') format('truetype'), url('../fonts/inconsolata-regular.svg#inconsolataregular') format('svg')
|
||||
font-weight: 400
|
||||
font-style: normal
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000
|
||||
|
||||
@font-face
|
||||
font-family: 'Inconsolata'
|
||||
src: url('../fonts/inconsolata-bold.eot')
|
||||
src: url('../fonts/inconsolata-bold.eot?#iefix') format('embedded-opentype'), url('../fonts/inconsolata-bold.woff2') format('woff2'), url('../fonts/inconsolata-bold.woff') format('woff'), url('../fonts/inconsolata-bold.ttf') format('truetype'), url('../fonts/inconsolata-bold.svg#inconsolatabold') format('svg')
|
||||
font-weight: 700
|
||||
font-style: normal
|
||||
unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF
|
||||
|
||||
@font-face
|
||||
font-family: 'Inconsolata'
|
||||
src: url('../fonts/inconsolata-bold.eot')
|
||||
src: url('../fonts/inconsolata-bold.eot?#iefix') format('embedded-opentype'), url('../fonts/inconsolata-bold.woff2') format('woff2'), url('../fonts/inconsolata-bold.woff') format('woff'), url('../fonts/inconsolata-bold.ttf') format('truetype'), url('../fonts/inconsolata-bold.svg#inconsolatabold') format('svg')
|
||||
font-weight: 700
|
||||
font-style: normal
|
||||
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000
|
||||
|
||||
// Font families
|
||||
$ff-regular: Georgia, 'Times New Roman', serif
|
||||
$ff-display: 'Karla', Arial, sans-serif
|
||||
$ff-code: 'Inconsolata', monospace
|
45
website/src/sass/_mixins.sass
Normal file
45
website/src/sass/_mixins.sass
Normal file
|
@ -0,0 +1,45 @@
|
|||
@mixin colors($background, $foreground)
|
||||
@if $background != "inherit"
|
||||
background-color: #{$background}
|
||||
@if $foreground != "inherit"
|
||||
color: #{$foreground}
|
||||
|
||||
@mixin vendor($name, $argument)
|
||||
-webkit-#{$name}: #{$argument}
|
||||
-ms-#{$name}: #{$argument}
|
||||
-moz-#{$name}: #{$argument}
|
||||
-o-#{$name}: #{$argument}
|
||||
#{$name}: #{$argument}
|
||||
|
||||
@mixin position($type, $position, $direction, $positionval, $directionval)
|
||||
position: #{$type}
|
||||
#{$position}: #{$positionval}
|
||||
#{$direction}: #{$directionval}
|
||||
|
||||
@mixin spacing($margin, $padding)
|
||||
@if $margin != "inherit"
|
||||
margin: #{$margin}
|
||||
@if $padding != "inherit"
|
||||
padding: #{$padding}
|
||||
|
||||
@mixin font-size($font-size, $line-height)
|
||||
@if $font-size != "inherit"
|
||||
font-size: #{$font-size}
|
||||
|
||||
@if $line-height != "inherit"
|
||||
line-height: #{$line-height}
|
||||
|
||||
@mixin size($width, $height: $width)
|
||||
@if $width != "auto"
|
||||
width: $width
|
||||
|
||||
@if $height != "auto"
|
||||
height: $height
|
||||
|
||||
@mixin font($font, $text-transform, $text-align)
|
||||
font: $font
|
||||
|
||||
@if $text-align != "inherit"
|
||||
text-align: $text-align
|
||||
@if $text-transform != "inherit"
|
||||
text-transform: $text-transform
|
94
website/src/sass/_prism.sass
Normal file
94
website/src/sass/_prism.sass
Normal file
|
@ -0,0 +1,94 @@
|
|||
code,
|
||||
pre
|
||||
@include font(bold 1rem/1.5em $ff-code, inherit, inherit)
|
||||
@include vendor(tab-size, 4)
|
||||
@include vendor(hyphens, none)
|
||||
direction: ltr
|
||||
white-space: pre
|
||||
border: none
|
||||
word:
|
||||
spacing: normal
|
||||
break: normal
|
||||
|
||||
pre
|
||||
@include spacing(0 0 2em 0, 2em)
|
||||
@include colors($prism-bg, $prism-text)
|
||||
overflow: auto
|
||||
text-shadow: 0 1px rgba(0, 0, 0, 0.3)
|
||||
|
||||
*:not(pre)
|
||||
> code
|
||||
@include spacing(0 .25em, 0 .5em)
|
||||
display: inline-block
|
||||
border-radius: .2em
|
||||
|
||||
&[class*="language-"]
|
||||
@include colors($prism-bg, $prism-text)
|
||||
|
||||
&:not([class*="language-"])
|
||||
border: 1px solid lighten($c-medium, 10%)
|
||||
|
||||
.declaration code
|
||||
background: transparent
|
||||
border: none !important
|
||||
|
||||
.token.comment,
|
||||
.token.prolog,
|
||||
.token.doctype,
|
||||
.token.cdata
|
||||
color: $prism-comment
|
||||
|
||||
.token.punctuation
|
||||
color: $prism-punct
|
||||
|
||||
.namespace
|
||||
opacity: .7
|
||||
|
||||
.token.property,
|
||||
.token.tag,
|
||||
.token.constant,
|
||||
.token.symbol,
|
||||
.token.deleted
|
||||
color: $prism-tag
|
||||
|
||||
.token.boolean,
|
||||
.token.number
|
||||
color: $prism-number
|
||||
|
||||
.token.selector,
|
||||
.token.attr-name,
|
||||
.token.string,
|
||||
.token.char,
|
||||
.token.builtin,
|
||||
.token.inserted
|
||||
color: $prism-selector
|
||||
|
||||
.token.operator,
|
||||
.token.entity,
|
||||
.token.url,
|
||||
.language-css .token.string,
|
||||
.style .token.string,
|
||||
.token.variable
|
||||
color: $prism-operator
|
||||
|
||||
.token.atrule,
|
||||
.token.attr-value,
|
||||
.token.function
|
||||
color: $prism-value
|
||||
|
||||
.token.keyword
|
||||
color: $prism-keyword
|
||||
|
||||
.token.regex,
|
||||
.token.important
|
||||
color: $prism-regex
|
||||
|
||||
.token.important,
|
||||
.token.bold
|
||||
font-weight: bold
|
||||
|
||||
.token.italic
|
||||
font-style: italic
|
||||
|
||||
.token.entity
|
||||
cursor: help
|
781
website/src/sass/style.sass
Normal file
781
website/src/sass/style.sass
Normal file
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user