From 29da87d9720721d5b0edb789253397b943802ccf Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Sun, 15 Nov 2015 14:57:57 +0100
Subject: [PATCH 01/13] push version
---
website/src/jade/home/index.jade | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/website/src/jade/home/index.jade b/website/src/jade/home/index.jade
index cbf5d9255..206a7777f 100644
--- a/website/src/jade/home/index.jade
+++ b/website/src/jade/home/index.jade
@@ -29,10 +29,10 @@ include ../header.jade
li: a.button(href="#example-use") Examples
li: a.button(href="#install")
| Install
- v0.97
+ v0.99
article.page.landing-page
+Section("Comparisons and Benchmarks", "comparisons", "./_comparisons.jade")
+Section("Online Demo", "online-demo", "./_online_demo.jade")
+Section("Usage by Example", "example-use", "./_usage_examples.jade")
- +Section("Install v0.97", "install", "./_installation.jade")
+ +Section("Install v0.99", "install", "./_installation.jade")
From b1e8905ef4ed63d4cda84c0a4a17e799690fcb59 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Thu, 19 Nov 2015 13:22:56 +0100
Subject: [PATCH 02/13] fix website build
---
fabfile.py | 12 ++++++++----
website/README.md | 9 ++++-----
website/create_code_samples | 2 +-
3 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/fabfile.py b/fabfile.py
index 22dadac11..df214d93e 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -60,7 +60,7 @@ def prebuild(build_dir='/tmp/build_spacy'):
local('py.test --models spacy/tests/')
-def docs():
+def web():
def jade(source_name, out_dir):
pwd = path.join(path.dirname(__file__), 'website')
jade_loc = path.join(pwd, 'src', 'jade', source_name)
@@ -68,7 +68,7 @@ def docs():
local('jade -P %s --out %s' % (jade_loc, out_loc))
with virtualenv(VENV_DIR):
- local('./website/create_code_samples tests/website/ website/src/code/')
+ local('./website/create_code_samples spacy/tests/website/ website/src/code/')
jade('home/index.jade', '')
jade('docs/index.jade', 'docs/')
@@ -79,8 +79,12 @@ def docs():
if post_dir.is_dir() \
and (post_dir / 'index.jade').exists() \
and (post_dir / 'meta.jade').exists():
- jade(str(post_dir / 'index.jade'), path.join('blogs', post_dir.parts[-1]))
-
+ jade(str(post_dir / 'index.jade'), path.join('blog', post_dir.parts[-1]))
+
+
+def web_publish():
+ local('aws s3 sync --delete website/site/ s3://spacy.io')
+
def publish(version):
with virtualenv(VENV_DIR):
diff --git a/website/README.md b/website/README.md
index efc3cd96c..755d1cba2 100644
--- a/website/README.md
+++ b/website/README.md
@@ -15,7 +15,7 @@ The Stack
--------
The site is built with the [Jade](http://jade-lang.com/) template language.
-See [the Makefile](Makefile) for more
+See [fabfile.py](/fabfile.py) under ```web()``` for more
Developing
@@ -23,8 +23,7 @@ Developing
To make and test changes
```
npm install jade --global
- cd website
- make
- python -m SimpleHTTPServer 8000
+ fab web
+ python -m SimpleHTTPServer 8000 website/site
```
-Then visit [localhost:8000/src/...](http://localhost:8000/src/)
+Then visit [localhost:8000](http://localhost:8000)
diff --git a/website/create_code_samples b/website/create_code_samples
index 2b9938edb..75882c24b 100755
--- a/website/create_code_samples
+++ b/website/create_code_samples
@@ -26,7 +26,7 @@ def main(src_dirname, dst_dirname):
continue
# Remove test_ prefix and .py suffix
- name = filename[6:-3]
+ name = filename[5:-3]
with io.open(os.path.join(src_dirname, filename), 'r', encoding='utf8') as file_:
source = file_.readlines()
tree = ast.parse("".join(source))
From 4e98ea4e4171cf17af2f9789b3723eeddb23f610 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Sat, 21 Nov 2015 19:04:57 +0100
Subject: [PATCH 03/13] bump version
---
requirements.txt | 2 +-
setup.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index ffa8664d5..344cc7665 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ plac
six
ujson
cloudpickle
-sputnik == 0.5.1
+sputnik == 0.5.2
diff --git a/setup.py b/setup.py
index 0721524a3..46d9ee67d 100644
--- a/setup.py
+++ b/setup.py
@@ -179,7 +179,7 @@ def run_setup(exts):
license="MIT",
install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
- 'ujson', 'cloudpickle', 'sputnik == 0.5.1'],
+ 'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
setup_requires=["headers_workaround"],
cmdclass = {'build_ext': build_ext_subclass },
)
From 129e6419c49b263df9d69a2880d5a1185f2bf6c2 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Tue, 24 Nov 2015 14:46:48 +0100
Subject: [PATCH 04/13] adapt website urls, deploy website via fabric
---
fabfile.py | 7 ++++++-
website/src/jade/blog/displacy/index.jade | 6 +++---
.../src/jade/blog/eli5-computers-learn-reading/index.jade | 4 ++--
website/src/jade/header.jade | 4 ++--
website/src/jade/home/_online_demo.jade | 4 ++--
5 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/fabfile.py b/fabfile.py
index 22dadac11..c46240681 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -60,7 +60,7 @@ def prebuild(build_dir='/tmp/build_spacy'):
local('py.test --models spacy/tests/')
-def docs():
+def web():
def jade(source_name, out_dir):
pwd = path.join(path.dirname(__file__), 'website')
jade_loc = path.join(pwd, 'src', 'jade', source_name)
@@ -82,6 +82,11 @@ def docs():
jade(str(post_dir / 'index.jade'), path.join('blogs', post_dir.parts[-1]))
+def web_publish(assets_path):
+ local('aws s3 sync --delete website/site/ s3://spacy.io')
+ local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
+
+
def publish(version):
with virtualenv(VENV_DIR):
local('git push origin master')
diff --git a/website/src/jade/blog/displacy/index.jade b/website/src/jade/blog/displacy/index.jade
index f388a084d..cac4fae5b 100644
--- a/website/src/jade/blog/displacy/index.jade
+++ b/website/src/jade/blog/displacy/index.jade
@@ -2,7 +2,7 @@ include ../../header.jade
include ./meta.jade
mixin Displacy(sentence, caption_text, height)
- - var url = "/displacy/?full=" + sentence.replace(" ", "%20")
+ - var url = "http://api.spacy.io/displacy/?full=" + sentence.replace(" ", "%20")
.displacy
iframe.displacy(src="/resources/displacy/robots.html" height=height)
@@ -20,7 +20,7 @@ mixin Displacy(sentence, caption_text, height)
p A syntactic dependency parse is a kind of shallow meaning representation. It's an important piece of many language understanding and text processing technologies. Now that these representations can be computed quickly, and with increasingly high accuracy, they're being used in lots of applications – translation, sentiment analysis, and summarization are major application areas.
- p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="http://spaCy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="/displacy") displaCy]:
+ p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="http://spaCy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="http://api.spacy.io/displacy") displaCy]:
+Displacy("Robots in popular culture are there to remind us of the awesomeness of unbounded human agency", "Click the button to full-screen and interact, or scroll to see the full parse.", 325)
@@ -40,7 +40,7 @@ mixin Displacy(sentence, caption_text, height)
p To me, this seemed like witchcraft, or a hack at best. But I was quickly won over: if all we do is declare the data and the relationships, in standards-compliant HTML and CSS, then we can simply step back and let the browser do its job. We know the code will be small, the layout will work on a variety of display, and we'll have a ready separation of style and content. For long output, we simply let the graphic overflow, and let users scroll.
- p What I'm particularly excited about is the potential for displaCy as an #[a(href="http://spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
+ p What I'm particularly excited about is the potential for displaCy as an #[a(href="http://api.spacy.io/displacy/?manual=Robots%20in%20popular%20culture%20are%20there%20to%20remind%20us%20of%20the%20awesomeness%20of%20unbounded%20human%20agency" target="_blank") annotation tool]. It may seem unintuitive at first, but I think it will be much better to annotate texts the way the parser operates, with a small set of actions and a stack, than by selecting arcs directly. Why? A few reasons:
ul
li You're always asked a question. You don't have to decide-what-to-decide.
diff --git a/website/src/jade/blog/eli5-computers-learn-reading/index.jade b/website/src/jade/blog/eli5-computers-learn-reading/index.jade
index 4f3e9ebb1..d0a130fe1 100644
--- a/website/src/jade/blog/eli5-computers-learn-reading/index.jade
+++ b/website/src/jade/blog/eli5-computers-learn-reading/index.jade
@@ -10,7 +10,7 @@ include ./meta.jade
p It turns out that almost anything we say could mean many many different things, but we don't notice because almost all of those meanings would be weird or stupid or just not possible. If I say:
- p.example #[a(href="http://spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
+ p.example #[a(href="http://api.spacy.io/displacy/?full=I%20saw%20a%20movie%20in%20a%20dress" target="_blank") I saw a movie in a dress]
p Would you ever ask me,
@@ -18,7 +18,7 @@ include ./meta.jade
p It's weird to even think of that. But a computer just might, because there are other cases like:
- p.example #[a(href="http://spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
+ p.example #[a(href="http://api.spacy.io/displacy/?full=The%20TV%20showed%20a%20girl%20in%20a%20dress" target="_blank") The TV showed a girl in a dress]
p Where the words hang together in the other way. People used to think that the answer was to tell the computer lots and lots of facts. But then you wake up one day and you're writing facts like #[em movies do not wear dresses], and you wonder where it all went wrong. Actually it's even worse than that. Not only are there too many facts, most of them are not even really facts! #[a(href="https://en.wikipedia.org/wiki/Cyc") People really tried this]. We've found that the world is made up of #[em if]s and #[em but]s.
diff --git a/website/src/jade/header.jade b/website/src/jade/header.jade
index 1a2bef214..e07ca01e0 100644
--- a/website/src/jade/header.jade
+++ b/website/src/jade/header.jade
@@ -99,7 +99,7 @@ mixin WritePage(Site, Author, Page)
meta(property="article:published_time" content=getDate(Page.date).timestamp)
link(rel="stylesheet" href="/resources/css/style.css")
- //[if lt IE 9]>
Date: Tue, 24 Nov 2015 19:33:29 +0100
Subject: [PATCH 05/13] fix merge
---
fabfile.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/fabfile.py b/fabfile.py
index 58ec73391..8af52f135 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -82,10 +82,6 @@ def web():
jade(str(post_dir / 'index.jade'), path.join('blog', post_dir.parts[-1]))
-def web_publish():
- local('aws s3 sync --delete website/site/ s3://spacy.io')
-
-
def web_publish(assets_path):
local('aws s3 sync --delete website/site/ s3://spacy.io')
local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
From abe6162e7b634c8ef33e5d54bc8a832ced9391ea Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Tue, 24 Nov 2015 20:01:43 +0100
Subject: [PATCH 06/13] avoid redirect
---
setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py
index 0721524a3..998776eda 100644
--- a/setup.py
+++ b/setup.py
@@ -144,7 +144,7 @@ def cython_setup(mod_names, language, includes):
author='Matthew Honnibal',
author_email='honnibal@gmail.com',
version=VERSION,
- url="http://honnibal.github.io/spaCy/",
+ url="http://spacy.io",
package_data=PACKAGE_DATA,
ext_modules=exts,
cmdclass={'build_ext': build_ext_cython_subclass},
From 89c1c72dd995ec258021b43c0772c1036dcd4bdd Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Tue, 24 Nov 2015 22:31:49 +0100
Subject: [PATCH 07/13] fix jade warnings, strip redundant html pages
---
website/README.md | 2 +-
website/src/jade/404.html | 67 --
.../dead-code-should-be-buried/index.html | 97 ---
.../blog/dead-code-should-be-buried/meta.html | 0
.../blog/parsing-english-in-python/index.html | 535 ----------------
.../blog/parsing-english-in-python/meta.html | 0
website/src/jade/docs/_api.html | 606 ------------------
website/src/jade/docs/_api.jade | 5 +-
website/src/jade/docs/_spec.jade | 3 +-
website/src/jade/home/_installation.jade | 7 +-
10 files changed, 7 insertions(+), 1315 deletions(-)
delete mode 100644 website/src/jade/404.html
delete mode 100644 website/src/jade/blog/dead-code-should-be-buried/index.html
delete mode 100644 website/src/jade/blog/dead-code-should-be-buried/meta.html
delete mode 100644 website/src/jade/blog/parsing-english-in-python/index.html
delete mode 100644 website/src/jade/blog/parsing-english-in-python/meta.html
delete mode 100644 website/src/jade/docs/_api.html
diff --git a/website/README.md b/website/README.md
index 755d1cba2..928c9591e 100644
--- a/website/README.md
+++ b/website/README.md
@@ -24,6 +24,6 @@ To make and test changes
```
npm install jade --global
fab web
- python -m SimpleHTTPServer 8000 website/site
+ cd website/site; python -m SimpleHTTPServer 8000; cd -
```
Then visit [localhost:8000](http://localhost:8000)
diff --git a/website/src/jade/404.html b/website/src/jade/404.html
deleted file mode 100644
index 253659d4f..000000000
--- a/website/src/jade/404.html
+++ /dev/null
@@ -1,67 +0,0 @@
-
-
-
- 404 | spaCy.io
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Imagine: you try to use Google Translate, but it asks you to first select which model you want. The new, awesome deep-learning model is there, but so are lots of others. You pick one that sounds fancy, but it turns out it's a 20-year-old experimental model trained on a corpus of oven manuals. You are not interested in oven manuals.
-
Of course, this is not how Google Translate operates. They make sure the model you use is good. This is what spaCy does, too. But in most natural language understanding libraries, it's just not anybody's job to delete obsolete models. There's also a real reluctance to editorialize. Some advantage can be found for every model. Like, is it really fair to call that oven-specific model obsolete? In some ways we still have a lot to learn from its principled approach. And what if someone needs to translate an oven manual?
-
Have a look through the GATE software. There's a lot there, developed over 12 years and many person-hours. But there's approximately zero curation. The philosophy is just to provide things. It's up to you to decide what to use.
-
This is bad. It's bad to provide an implementation of MiniPar, and have it just...sit there, with no hint that it's 20 years old and should not be used. The RASP parser, too. Why are these provided? Worse, why is there no warning? Unless you want to investigate the history of the field, there's no reason to execute these programs in 2015.
-
Check out how Dekang Lin, the author of Minipar, presents the software – with reference to a benchmark on a Pentium II. This is the right way to archive the program. In this form its status is clear.
-
Various people have asked me why I decided to make a new Python NLP library, spaCy, instead of supporting the NLTK project. There are many things I dislike about the NLTK code-base, but the lack of curation is really my key complaint: the project simply doesn't throw anything away, and it refuses to call any technique or implementation good or bad.
-
In March NLTK announced the inclusion of a more up-to-date dependency parsing algorithm, based on the linear-time algorithm everyone is now using. There was some excitement about this, as this type of parser really should get much better accuracy than the other algorithms NLTK includes. But can you tell which of these parsers is the new one?
-
The best parser there – the new one – is called "transition parser". But it's still not actually good. Unfortunately, the NLTK implementation is based on Nivre's original 2003 paper, instead of using the recent research; and they use external, general-purpose machine learning libraries, instead of a simple custom implementation that would perform much better. Together these limitations mean the performance of the model is terrible, relative to the current state-of-the-art.
-
I happened to visit the NLTK issue tracker while they were discussing the transition-based parser, so I linked them to my post explaining how to implement this parser in 500 lines of Python. I got a "thanks but no thanks", and the issue was abruptly closed. Another researcher's offer from 2012 to implement this type of model also went unanswered.
-
An enormous amount of work has gone into, and is still going into, making NLTK an easily accessible way for computer science students to learn a little bit about linguistics, or for linguistics students to learn a little bit about computer science. I respect that work.
-
But nowhere does it say that if you want to really build something, or do up-to-date research, NLTK isn't for you. NLTK claims it can serve that use-case. But it can't. The implication is that if you use the models provided in NLTK, e.g. its chunker, tagger, dependency parser etc, these will be roughly equivalent to what you'll get elsewhere. But they're not. The gulf in quality is enormous. NLTK does not even know how its POS tagger was trained. The model is just this .pickle file that's been passed around for 5 years, its origins lost to time. This is not okay.
-
I think open source software should be very careful to make its limitations clear. It's a disservice to provide something that's much less useful than you imply. It's like offering your friend a lift and then not showing up. It's totally fine to not do something – so long as you never suggested you were going to do it. There are ways to do worse than nothing.
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/website/src/jade/blog/dead-code-should-be-buried/meta.html b/website/src/jade/blog/dead-code-should-be-buried/meta.html
deleted file mode 100644
index e69de29bb..000000000
diff --git a/website/src/jade/blog/parsing-english-in-python/index.html b/website/src/jade/blog/parsing-english-in-python/index.html
deleted file mode 100644
index 3d01894cc..000000000
--- a/website/src/jade/blog/parsing-english-in-python/index.html
+++ /dev/null
@@ -1,535 +0,0 @@
-
-
-
- Parsing English in 500 lines of Python | spaCy.io
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
2015-08-19 Update: I wrote this blog post in 2013, describing an exciting advance in natural language understanding technology. Today, almost all high-performance parsers are using a variant of the algorithm described below (including spaCy). The original post is preserved below, with added commentary in light of recent research.
-
A syntactic parser describes a sentence’s grammatical structure, to help another application reason about it. Natural languages introduce many unexpected ambiguities, which our world-knowledge immediately filters out. A favourite example:
-
They ate the pizza with anchovies
-
-
A correct parse links “with” to “pizza”, while an incorrect parse links “with” to “eat”:
-
-
-
-
-
-
Prepositional phrase attachment is a common source of errors for statistical parsers.
-
-
The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
-
Update: CoreNLP now features high-performance transition-based models. It is much faster than the Redshift parser (my research system), but less accurate. spaCy is faster again still, more accurate than CoreNLP, but less accurate than Redshift, due to spaCy's use of greedy search. It would be relatively easy to provide a beam-search version of spaCy...But, I think the gap in accuracy will continue to close, especially given advances in neural network learning.
The rest of the post sets up the problem, and then takes you through a concise implementation, prepared for this post. The first 200 lines of parser.py, the part-of-speech tagger and learner, are described here. You should probably at least skim that post before reading this one, unless you’re very familiar with NLP research.
-
The Cython system, Redshift, was written for my current research. I plan to improve it for general use in June, after my contract ends at Macquarie University. The current version is hosted on GitHub.
-
Problem Description
-
It’d be nice to type an instruction like this into your phone:
-
Set volume to zero when I’m in a meeting, unless John’s school calls.
-
And have it set the appropriate policy. On Android you can do this sort of thing with Tasker, but an NL interface would be much better. It’d be especially nice to receive a meaning representation you could edit, so you could see what it thinks you said, and correct it.
-
There are lots of problems to solve to make that work, but some sort of syntactic representation is definitely necessary. We need to know that:
-
Unless John’s school calls, when I’m in a meeting, set volume to zero
-
is another way of phrasing the first instruction, while:
-
Unless John’s school, call when I’m in a meeting
-
means something completely different.
-
A dependency parser returns a graph of word-word relationships, intended to make such reasoning easier. Our graphs will be trees – edges will be directed, and every node (word) will have exactly one incoming arc (one dependency, with its head), except one.
-
Example usage
-
parser = parser.Parser()
-tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split()
->>> tags, heads = parser.parse(tokens)
->>> heads
-[-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11]
->>> for i, h in enumerate(heads):
-... head = tokens[heads[h]] if h >= 1 else 'None'
-... print(tokens[i] + ' <-- ' + head])
-Set <-- None
-the <-- volume
-volume <-- Set
-to <-- Set
-zero <-- to
-when <-- Set
-I <-- 'm
-'m <-- when
-in <-- 'm
-a <-- meeting
-meeting <-- in
-unless <-- Set
-John <-- 's
-'s <-- calls
-school <-- calls
-calls <-- unless
-
The idea is that it should be slightly easier to reason from the parse, than it was from the string. The parse-to-meaning mapping is hopefully simpler than the string-to-meaning mapping.
-
The most confusing thing about this problem area is that “correctness” is defined by convention — by annotation guidelines. If you haven’t read the guidelines and you’re not a linguist, you can’t tell whether the parse is “wrong” or “right”, which makes the whole task feel weird and artificial.
-
For instance, there’s a mistake in the parse above: “John’s school calls” is structured wrongly, according to the Stanford annotation guidelines. The structure of that part of the sentence is how the annotators were instructed to parse an example like “John’s school clothes”.
-
It’s worth dwelling on this point a bit. We could, in theory, have written our guidelines so that the “correct” parses were reversed. There’s good reason to believe the parsing task will be harder if we reversed our convention, as it’d be less consistent with the rest of the grammar. [2] But we could test that empirically, and we’d be pleased to gain an advantage by reversing the policy.
-
We definitely do want that distinction in the guidelines — we don’t want both to receive the same structure, or our output will be less useful. The annotation guidelines strike a balance between what distinctions downstream applications will find useful, and what parsers will be able to predict easily.
-
Projective trees
-
There’s a particularly useful simplification that we can make, when deciding what we want the graph to look like: we can restrict the graph structures we’ll be dealing with. This doesn’t just give us a likely advantage in learnability; it can have deep algorithmic implications. We follow most work on English in constraining the dependency graphs to be projective trees:
-
-
Tree. Every word has exactly one head, except for the dummy ROOT symbol.
-
Projective. For every pair of dependencies (a1, a2) and (b1, b2), if a1 < b2, then a2 >= b2. In other words, dependencies cannot “cross”. You can’t have a pair of dependencies that goes a1 b1 a2 b2, or b1 a1 b2 a2.
-
-
There’s a rich literature on parsing non-projective trees, and a smaller literature on parsing DAGs. But the parsing algorithm I’ll be explaining deals with projective trees.
-
Greedy transition-based parsing
-
Our parser takes as input a list of string tokens, and outputs a list of head indices, representing edges in the graph. If the ith member of heads is j, the dependency parse contains an edge (j, i). A transition-based parser is a finite-state transducer; it maps an array of N words onto an output array of N head indices:
-
-
-
-
start
-
MSNBC
-
reported
-
that
-
Facebook
-
bought
-
WhatsApp
-
for
-
$16bn
-
root
-
-
-
0
-
2
-
9
-
2
-
4
-
2
-
4
-
4
-
7
-
0
-
-
-
-
The heads array denotes that the head of MSNBC is reported:
- MSNBC is word 1, and reported is word 2, and heads[1] == 2. You can already see why parsing a tree is handy — this data structure wouldn’t work if we had to output a DAG, where words may have multiple heads.
-
-
Although heads can be represented as an array, we’d actually like to maintain some alternate ways to access the parse, to make it easy and efficient to extract features. Our Parse class looks like this:
-
class Parse(object):
- def __init__(self, n):
- self.n = n
- self.heads = [None] * (n-1)
- self.lefts = []
- self.rights = []
- for i in range(n+1):
- self.lefts.append(DefaultList(0))
- self.rights.append(DefaultList(0))
-
- def add_arc(self, head, child):
- self.heads[child] = head
- if child < head:
- self.lefts[head].append(child)
- else:
- self.rights[head].append(child)
-
As well as the parse, we also have to keep track of where we’re up to in the sentence. We’ll do this with an index into the words array, and a stack, to which we’ll push words, before popping them once their head is set. So our state data structure is fundamentally:
-
-
An index, i, into the list of tokens;
-
The dependencies added so far, in Parse
-
A stack, containing words that occurred before i, for which we’re yet to assign a head.
-
-
Each step of the parsing process applies one of three actions to the state:
-
SHIFT = 0; RIGHT = 1; LEFT = 2
-MOVES = [SHIFT, RIGHT, LEFT]
-
-def transition(move, i, stack, parse):
- global SHIFT, RIGHT, LEFT
- if move == SHIFT:
- stack.append(i)
- return i + 1
- elif move == RIGHT:
- parse.add_arc(stack[-2], stack.pop())
- return i
- elif move == LEFT:
- parse.add_arc(i, stack.pop())
- return i
- raise GrammarError("Unknown move: %d" % move)
-
The LEFT and RIGHT actions add dependencies and pop the stack, while SHIFT pushes the stack and advances i into the buffer.
-
So, the parser starts with an empty stack, and a buffer index at 0, with no dependencies recorded. It chooses one of the (valid) actions, and applies it to the state. It continues choosing actions and applying them until the stack is empty and the buffer index is at the end of the input. (It’s hard to understand this sort of algorithm without stepping through it. Try coming up with a sentence, drawing a projective parse tree over it, and then try to reach the parse tree by choosing the right sequence of transitions.)
-
Here’s what the parsing loop looks like in code:
-
class Parser(object):
- ...
- def parse(self, words):
- tags = self.tagger(words)
- n = len(words)
- idx = 1
- stack = [0]
- deps = Parse(n)
- while stack or idx < n:
- features = extract_features(words, tags, idx, n, stack, deps)
- scores = self.model.score(features)
- valid_moves = get_valid_moves(i, n, len(stack))
- next_move = max(valid_moves, key=lambda move: scores[move])
- idx = transition(next_move, idx, stack, parse)
- return tags, parse
-
-def get_valid_moves(i, n, stack_depth):
- moves = []
- if i < n:
- moves.append(SHIFT)
- if stack_depth <= 2:
- moves.append(RIGHT)
- if stack_depth <= 1:
- moves.append(LEFT)
- return moves
-
We start by tagging the sentence, and initializing the state. We then map the state to a set of features, which we score using a linear model. We then find the best-scoring valid move, and apply it to the state.
-
The model scoring works the same as it did in the POS tagger. If you’re confused about the idea of extracting features and scoring them with a linear model, you should review that post. Here’s a reminder of how the model scoring works:
-
class Perceptron(object)
- ...
- def score(self, features):
- all_weights = self.weights
- scores = dict((clas, 0) for clas in self.classes)
- for feat, value in features.items():
- if value == 0:
- continue
- if feat not in all_weights:
- continue
- weights = all_weights[feat]
- for clas, weight in weights.items():
- scores[clas] += value * weight
- return scores
-
It’s just summing the class-weights for each feature. This is often expressed as a dot-product, but when you’re dealing with multiple classes, that gets awkward, I find.
-
The beam parser (RedShift) tracks multiple candidates, and only decides on the best one at the very end. We’re going to trade away accuracy in favour of efficiency and simplicity. We’ll only follow a single analysis. Our search strategy will be entirely greedy, as it was with the POS tagger. We’ll lock-in our choices at every step.
-
If you read the POS tagger post carefully, you might see the underlying similarity. What we’ve done is mapped the parsing problem onto a sequence-labelling problem, which we address using a “flat”, or unstructured, learning algorithm (by doing greedy search).
-
Features
-
Feature extraction code is always pretty ugly. The features for the parser refer to a few tokens from the context:
-
-
The first three words of the buffer (n0, n1, n2)
-
The top three words of the stack (s0, s1, s2)
-
The two leftmost children of s0 (s0b1, s0b2);
-
The two rightmost children of s0 (s0f1, s0f2);
-
The two leftmost children of n0 (n0b1, n0b2)
-
-
For these 12 tokens, we refer to the word-form, the part-of-speech tag, and the number of left and right children attached to the token.
-
Because we’re using a linear model, we have our features refer to pairs and triples of these atomic properties.
-
def extract_features(words, tags, n0, n, stack, parse):
- def get_stack_context(depth, stack, data):
- if depth >= 3:
- return data[stack[-1]], data[stack[-2]], data[stack[-3]]
- elif depth >= 2:
- return data[stack[-1]], data[stack[-2]], ''
- elif depth == 1:
- return data[stack[-1]], '', ''
- else:
- return '', '', ''
-
- def get_buffer_context(i, n, data):
- if i + 1 >= n:
- return data[i], '', ''
- elif i + 2 >= n:
- return data[i], data[i + 1], ''
- else:
- return data[i], data[i + 1], data[i + 2]
-
- def get_parse_context(word, deps, data):
- if word == -1:
- return 0, '', ''
- deps = deps[word]
- valency = len(deps)
- if not valency:
- return 0, '', ''
- elif valency == 1:
- return 1, data[deps[-1]], ''
- else:
- return valency, data[deps[-1]], data[deps[-2]]
-
- features = {}
- # Set up the context pieces --- the word, W, and tag, T, of:
- # S0-2: Top three words on the stack
- # N0-2: First three words of the buffer
- # n0b1, n0b2: Two leftmost children of the first word of the buffer
- # s0b1, s0b2: Two leftmost children of the top word of the stack
- # s0f1, s0f2: Two rightmost children of the top word of the stack
-
- depth = len(stack)
- s0 = stack[-1] if depth else -1
-
- Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words)
- Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags)
-
- Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words)
- Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags)
-
- Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words)
- Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags)
-
- Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words)
- _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags)
-
- Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words)
- _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags)
-
- Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words)
- _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags)
-
- # Cap numeric features at 5?
- # String-distance
- Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0
-
- features['bias'] = 1
- # Add word and tag unigrams
- for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2):
- if w:
- features['w=%s' % w] = 1
- for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2):
- if t:
- features['t=%s' % t] = 1
-
- # Add word/tag pairs
- for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))):
- if w or t:
- features['%d w=%s, t=%s' % (i, w, t)] = 1
-
- # Add some bigrams
- features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1
- features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1
- features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1
- features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1
- features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1
- features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1
- features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1
- features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1
-
- # Add some tag trigrams
- trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),
- (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1),
- (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2),
- (Ts0, Ts1, Ts1))
- for i, (t1, t2, t3) in enumerate(trigrams):
- if t1 or t2 or t3:
- features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1
-
- # Add some valency and distance features
- vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b))
- vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b))
- d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0),
- ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0))
- for i, (w_t, v_d) in enumerate(vw + vt + d):
- if w_t or v_d:
- features['val/d-%d %s %d' % (i, w_t, v_d)] = 1
- return features
-
Training
-
Weights are learned using the same algorithm, averaged perceptron, that we used for part-of-speech tagging. Its key strength is that it’s an online learning algorithm: examples stream in one-by-one, we make our prediction, check the actual answer, and adjust our beliefs (weights) if we were wrong.
-
The training loop looks like this:
-
class Parser(object):
- ...
- def train_one(self, itn, words, gold_tags, gold_heads):
- n = len(words)
- i = 2; stack = [1]; parse = Parse(n)
- tags = self.tagger.tag(words)
- while stack or (i + 1) < n:
- features = extract_features(words, tags, i, n, stack, parse)
- scores = self.model.score(features)
- valid_moves = get_valid_moves(i, n, len(stack))
- guess = max(valid_moves, key=lambda move: scores[move])
- gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads)
- best = max(gold_moves, key=lambda move: scores[move])
- self.model.update(best, guess, features)
- i = transition(guess, i, stack, parse)
- # Return number correct
- return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]])
-
The most interesting part of the training process is in get_gold_moves. The performance of our parser is made possible by an advance by Goldberg and Nivre (2012), who showed that we’d been doing this wrong for years.
-
2015-08-19 Update: Interesting, CoreNLP continues to "do it wrong" – their transition-based parser uses the static-oracle, rather than the dynamic oracle described here. I attribute spaCy's accuracy advantage to this difference in training algorithm. The ClearNLP parser uses an iterative algorithm that achieves the same sort of thing (and was published prior to the dynamic oracle). I find the dynamic oracle idea much more conceptually clear.
-
In the POS-tagging post, I cautioned that during training you need to make sure you pass in the last two predicted tags as features for the current tag, not the last two gold tags. At test time you’ll only have the predicted tags, so if you base your features on the gold sequence during training, your training contexts won’t resemble your test-time contexts, so you’ll learn the wrong weights.
-
In parsing, the problem was that we didn’t know how to pass in the predicted sequence! Training worked by taking the gold-standard tree, and finding a transition sequence that led to it. i.e., you got back a sequence of moves, with the guarantee that if you followed those moves, you’d get the gold-standard dependencies.
-
The problem is, we didn’t know how to define the “correct” move to teach a parser to make if it was in any state that wasn’t along that gold-standard sequence. Once the parser had made a mistake, we didn’t know how to train from that example.
-
That was a big problem, because it meant that once the parser started making mistakes, it would end up in states unlike any in its training data – leading to yet more mistakes. The problem was specific to greedy parsers: once you use a beam, there’s a natural way to do structured prediction.
-
2015-08-19 Update: It's since been pointed out to me that what we're calling a "dynamic oracle" here is really a form of imitation learning.
-
The solution seems obvious once you know it, like all the best breakthroughs. What we do is define a function that asks “How many gold-standard dependencies can be recovered from this state?”. If you can define that function, then you can apply each move in turn, and ask, “How many gold-standard dependencies can be recovered from this state?”. If the action you applied allows fewer gold-standard dependencies to be reached, then it is sub-optimal.
We also have a set of actions, each of which returns a new state. We want to know:
-
-
shift_cost = Oracle(state) – Oracle(shift(state))
-
right_cost = Oracle(state) – Oracle(right(state))
-
left_cost = Oracle(state) – Oracle(left(state))
-
-
Now, at least one of those costs has to be zero. Oracle(state) is asking, “what’s the cost of the best path forward?”, and the first action of that best path has to be shift, right, or left.
-
It turns out that we can derive Oracle fairly simply for many transition systems. The derivation for the transition system we’re using, Arc Hybrid, is in Goldberg and Nivre (2013).
-
We’re going to implement the oracle as a function that returns the zero-cost moves, rather than implementing a function Oracle(state). This prevents us from doing a bunch of costly copy operations. Hopefully the reasoning in the code isn’t too hard to follow, but you can also consult Goldberg and Nivre’s papers if you’re confused and want to get to the bottom of this.
-
def get_gold_moves(n0, n, stack, heads, gold):
- def deps_between(target, others, gold):
- for word in others:
- if gold[word] == target or gold[target] == word:
- return True
- return False
-
- valid = get_valid_moves(n0, n, len(stack))
- if not stack or (SHIFT in valid and gold[n0] == stack[-1]):
- return [SHIFT]
- if gold[stack[-1]] == n0:
- return [LEFT]
- costly = set([m for m in MOVES if m not in valid])
- # If the word behind s0 is its gold head, Left is incorrect
- if len(stack) >= 2 and gold[stack[-1]] == stack[-2]:
- costly.add(LEFT)
- # If there are any dependencies between n0 and the stack,
- # pushing n0 will lose them.
- if SHIFT not in costly and deps_between(n0, stack, gold):
- costly.add(SHIFT)
- # If there are any dependencies between s0 and the buffer, popping
- # s0 will lose them.
- if deps_between(stack[-1], range(n0+1, n-1), gold):
- costly.add(LEFT)
- costly.add(RIGHT)
- return [m for m in MOVES if m not in costly]
-
Doing this “dynamic oracle” training procedure makes a big difference to accuracy — typically 1-2%, with no difference to the way the run-time works. The old “static oracle” greedy training procedure is fully obsolete; there’s no reason to do it that way any more.
-
Conclusion
-
I have the sense that language technologies, particularly those relating to grammar, are particularly mysterious. I can imagine having no idea what the program might even do.
-
I think it therefore seems natural to people that the best solutions would be over-whelmingly complicated. A 200,000 line Java package feels appropriate.
-
But, algorithmic code is usually short, when only a single algorithm is implemented. And when you only implement one algorithm, and you know exactly what you want to write before you write a line, you also don’t pay for any unnecessary abstractions, which can have a big performance impact.
-
Notes
-
[1] I wasn’t really sure how to count the lines of code in the Stanford parser. Its jar file ships over 200k, but there are a lot of different models in it. It’s not important, but it's certainly over 4k.
-
[2] For instance, how would you parse, “John’s school of music calls”? You want to make sure the phrase “John’s school” has a consistent structure in both “John’s school calls” and “John’s school of music calls”. Reasoning about the different “slots” you can put a phrase into is a key way we reason about what syntactic analyses look like. You can think of each phrase as having a different shaped connector, which you need to plug into different slots — which each phrase also has a certain number of, each of a different shape. We’re trying to figure out what connectors are where, so we can figure out how the sentences are put together.
-
Idle speculation
-
For a long time, incremental language processing algorithms were primarily of scientific interest. If you want to write a parser to test a theory about how the human sentence processor might work, well, that parser needs to build partial interpretations. There’s a wealth of evidence, including commonsense introspection, that establishes that we don’t buffer input and analyse it once the speaker has finished.
-
But now algorithms with that neat scientific feature are winning! As best as I can tell, the secret to that success is to be:
-
-
Incremental. Earlier words constrain the search.
-
Error-driven. Training involves a working hypothesis, which is updated as it makes mistakes.
-
-
The links to human sentence processing seem tantalising. I look forward to seeing whether these engineering breakthroughs lead to any psycholinguistic advances.
-
Bibliography
-
The NLP literature is almost entirely open access. All of the relevant papers can be found here.
-
The parser I’ve described is an implementation of the dynamic-oracle Arc-Hybrid system here:Goldberg, Yoav; Nivre, Joakim. Training Deterministic Parsers with Non-Deterministic Oracles. TACL 2013
-
However, I wrote my own features for it. The arc-hybrid system was originally described here:Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic programming algorithms for transition-based dependency parsers. ACL 2011
-
The dynamic oracle training method was first described here:A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; Nivre, Joakim. COLING 2012
-
This work depended on a big break-through in accuracy for transition-based parsers, when beam-search was properly explored by Zhang and Clark. They have several papers, but the preferred citation is:Zhang, Yue; Clark, Steven. Syntactic Processing Using the Generalized Perceptron and Beam Search. Computational Linguistics 2011 (1)
-
Another important paper was this little feature engineering paper, which further improved the accuracy:Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with Rich Non-local Features. ACL 2011
-
The generalised perceptron, which is the learning framework for these beam parsers, is from this paper:Collins, Michael. Discriminative Training Methods for Hidden Markov Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002
-
Experimental details
-
The results at the start of the post refer to Section 22 of the Wall Street Journal corpus. The Stanford parser was run as follows:
I can’t easily read that anymore, but it should just convert every .mrg file in a folder to a CoNLL-format Stanford basic dependencies file, using the settings common in the dependency literature.
-
I then converted the gold-standard trees from WSJ 22, for the evaluation. Accuracy scores refer to unlabelled attachment score (i.e. the head index) of all non-punctuation tokens.
-
To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 into the same conversion script.
-
In a nutshell: The Stanford model and parser.py are trained on the same set of sentences, and they each make their predictions on a held-out test set, for which we know the answers. Accuracy refers to how many of the words’ heads we got correct.
-
Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a server, to give the Stanford parser more memory. The parser.py system runs fine on my MacBook Air. I used PyPy for the parser.py experiments; CPython was about half as fast on an early benchmark.
-
One of the reasons parser.py is so fast is that it does unlabelled parsing. Based on previous experiments, a labelled parser would likely be about 40x slower, and about 1% more accurate. Adapting the program to labelled parsing would be a good exercise for the reader, if you have access to the data.
-
The result from the Redshift parser was produced from commit b6b624c9900f3bf, which was run as follows:
Load models into a callable object to process English text. Intended use is for one instance to be created per process. You can create more if you're doing something unusual. You may wish to make the instance a global variable or "singleton". We usually instantiate the object in the main() function and pass it around as an explicit argument.
-
from spacy.en import English
-from spacy._doc_examples import download_war_and_peace
-
-unprocessed_unicode = download_war_and_peace()
-
-nlp = English()
-doc = nlp(unprocessed_unicode)
data_dir –
-
- The data directory. May be None, to disable any data loading (including the vocabulary).
-
-
Tagger – A class/function that creates the part-of-speech tagger. Usually this is left True, to load the default tagger. If falsey, no tagger is loaded.
-
You can also supply your own class/function, which will be called once on setup. The returned function will then be called in English.__call__. The function passed must accept two arguments, of types (StringStore, directory), and produce a function that accepts one argument, of type Doc. Its return type is unimportant.
-
-
Parser – A class/function that creates the syntactic dependency parser. Usually this is left True, to load the default parser. If falsey, no parser is loaded.
-
You can also supply your own class/function, which will be called once on setup. The returned function will then be called in English.__call__. The function passed must accept two arguments, of types (StringStore, directory), and produce a function that accepts one argument, of type Doc. Its return type is unimportant.
-
-
Entity – A class/function that creates the named entity recogniser. Usually this is left True, to load the default entity recognizer. If falsey, no entity recognizer is loaded.
-
You can also supply your own class/function, which will be called once on setup. The returned function will then be called in English.__call__. The function passed must accept two arguments, of types (StringStore, directory), and produce a function that accepts one argument, of type Doc. Its return type is unimportant.
-
-
load_vectors –
-
- A boolean value to control whether the word vectors are loaded.
-
The main entry point to spaCy. Takes raw unicode text, and returns a Doc object, which can be iterated to access Token and Span objects. spaCy's models are all linear-time, so you can supply documents of arbitrary length, e.g. whole novels.
-
-
text (unicode) – The text to be processed. spaCy expects raw unicode text – you don't necessarily need to, say, split it into paragraphs. However, depending on your documents, you might be better off applying custom pre-processing. Non-text formatting, e.g. from HTML mark-up, should be removed before sending the document to spaCy. If your documents have a consistent format, you may be able to improve accuracy by pre-processing. For instance, if the first word of your documents is always in upper-case, it may be helpful to normalize them before supplying them to spaCy.
-
-
tag (bool) –Whether to apply the part-of-speech tagger. Required for parsing and entity recognition.
-
-
parse (bool) – Whether to apply the syntactic dependency parser.
-
-
entity (bool) –Whether to apply the named entity recognizer.
-
-
-
from spacy.en import English
-nlp = English()
-doc = nlp(u'Some text.') # Applies tagger, parser, entity
-doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
-doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
-doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
-doc = nlp(u'') # Zero-length tokens, not an error
-# doc = nlp(b'Some text') <-- Error: need unicode
-doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary strings.
-
Internally, the Doc object holds an array of TokenC structs. The Python-level Token and Span objects are views of this array, i.e. they don't own the data themselves. These details of the internals shouldn't matter for the API – but they may help you read the code, and understand how spaCy is designed.
vocab – A Vocabulary object, which must match any models you want to use (e.g. tokenizer, parser, entity recognizer).
-
-
orth_and_spaces – A list of (orth_id, has_space) tuples, where orth_id is an integer, and has_space is a boolean, indicating whether the token has a trailing space.
-
-
-
-
-
-
-
Sequence API
-
-
doc[i] Get the Token object at position i, where i is an integer. Negative indexing is supported, and follows the usual Python semantics, i.e. doc[-2] is doc[len(doc) - 2].
-
-
doc[start : end] Get a Span object, starting at position start and ending at position end. For instance, doc[2:5] produces a span consisting of tokens 2, 3 and 4. Stepped slices (e.g. doc[start : end : step]) are not supported, as Span objects must be contiguous (cannot have gaps).
-
-
for token in doc Iterate over Token objects, from which the annotations can be easily accessed. This is the main way of accessing Token objects, which are the main way annotations are accessed from Python. If faster-than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython, via Doc.data, an array of TokenC structs. The C API has not yet been finalized, and is subject to change.
-
-
len(doc) The number of tokens in the document.
-
-
-
-
-
Sentence, entity and noun chunk spans
-
-
- sents
-
Yields sentence Span objects. Iterate over the span to get individual Token objects. Sentence spans have no label.
-
>>> from spacy.en import English
->>> nlp = English()
->>> doc = nlp(u"This is a sentence. Here's another...")
->>> for sentence in doc.sents:
-... sentence.root.orth_
-is
-'s
-
-
-
- ents
-
Yields named-entity Span objects. Iterate over the span to get individual Token objects, or access the label:
-
>>> from spacy.en import English
->>> nlp = English()
->>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
->>> ents = list(tokens.ents)
->>> ents[0].label, ents[0].label_, ents[0].orth_, ents[0].string
-(112504, 'PERSON', 'Best', ents[0].string)
-
-
-
- noun_chunks
-
Yields base noun-phrase Span objects. A base noun phrase, or "NP chunk", is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. For example:
-
>>> from spacy.en import English
->>> nlp = English()
->>> doc = nlp('The sentence in this example has three noun chunks.')
->>> for chunk in doc.noun_chunks:
-... print(chunk.label, chunk.orth_, '<--', chunk.root.head.orth_)
-NP The sentence <-- has
-NP this example <-- in
-NP three noun chunks <-- has
-
-
-
-
-
-
Export/Import
-
-
- to_array(attr_ids) Given a list of M attribute IDs, export the tokens to a numpy ndarray of shape N*M, where N is the length of the sentence.
-
-
attr_ids (list[int]) –A list of attribute ID ints. Attribute IDs can be imported from spacy.attrs
-
-
-
-
- count_by(attr_id) Produce a dict of {attribute (int): count (ints)} frequencies, keyed by the values of the given attribute ID.
-
-
-
- from_array(attrs, array) Write to a Doc object, from an M*N array of attributes.
-
-
- from_bytesDeserialize, loading from bytes.
-
-
- to_bytesSerialize, producing a byte string.
-
-
- read_bytesclassmethod
-
-
-
-
- classTokenA Token represents a single word, punctuation or significant whitespace symbol. Integer IDs are provided for all string features. The (unicode) string is provided by an attribute of the same name followed by an underscore, e.g. token.orth is an integer ID, token.orth_ is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed.
-
-
-
String Features
-
-
-
lemma / lemma_ The "base" of the word, with no inflectional suffixes, e.g. the lemma of "developing" is "develop", the lemma of "geese" is "goose", etc. Note that derivational suffixes are not stripped, e.g. the lemma of "institutions" is "institution", not "institute". Lemmatization is performed using the WordNet data, but extended to also cover closed-class words such as pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". We assign pronouns the lemma -PRON-.
-
-
-
-
orth / orth_The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
-
-
lower / lower_The form of the word, but forced to lower-case, i.e. lower = word.orth_.lower()
-
-
shape / shape_A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
-
-
prefix / prefix_A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. prefix = word.orth_[:1]
-
-
suffix / suffix_A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. suffix = word.orth_[-3:]
-
-
-
-
-
-
Boolean Flags
-
-
-
is_alpha Equivalent to word.orth_.isalpha()
-
-
is_ascii Equivalent to not any(ord(c) >= 128 for c in word.orth_)
-
-
is_digit Equivalent to word.orth_.isdigit()
-
-
is_lower Equivalent to word.orth_.islower()
-
-
is_title Equivalent to word.orth_.istitle()
-
-
is_punct Equivalent to word.orth_.ispunct()
-
-
is_space Equivalent to word.orth_.isspace()
-
-
like_url Does the word resemble a URL?
-
-
like_num Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
-
-
like_email Does the word resemble an email?
-
-
is_oov Is the word out-of-vocabulary?
-
-
-
- check_flag(flag_id) Get the value of one of the boolean flags
-
-
-
-
-
Distributional Features
-
-
-
prob The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
-
-
cluster The Brown cluster ID of the word. These are often useful features for linear models. If you’re using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
-
-
repvec A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
-
-
-
-
-
-
Alignment and Output
-
-
-
idxStart index of the token in the string
-
-
len(token)Length of the token's orth string, in unicode code-points.
-
-
unicode(token)Same as token.orth_
-
-
str(token) In Python 3, returns token.orth_. In Python 2, returns token.orth_.encode('utf8')
-
-
stringtoken.orth_ + token.whitespace_, i.e. the form of the word as it appears in the string,
- including trailing whitespace. This is useful when you need to use linguistic features to add inline mark-up to the string.
-
-
whitespace_ The trailing whitespace that follows the token in the string, if any, such that token.string is token.orth_ + token.whitespace_.
-
-
-
-
-
Navigating the Parse Tree
-
-
headThe immediate syntactic head of the token. If the token is the root of its sentence, it is the token itself, i.e. root_token.head is root_token
-
-
childrenAn iterator that yields from lefts, and then yields from rights.
-
-
subtreeAn iterator for the part of the sentence syntactically governed by the word, including the word itself.
-
-
left_edgeThe leftmost edge of the token's subtree
-
-
right_edgeThe rightmost edge of the token's subtree
-
-
-
- nbor(i=1)Get the ith next / previous neighboring token.
-
-
-
-
-
Named Entities
-
-
-
ent_typeIf the token is part of an entity, its entity type.
-
-
ent_iobThe IOB (inside, outside, begin) entity recognition tag for the token.
-
offset (int) –The index of the token within the document
-
-
-
-
-
-
-
-
- classSpanA Span is a slice of a Doc object, consisting of zero or more tokens. Spans are used to represent sentences, named entities, phrases, and arbitrary contiguous slices from the Doc object. Span objects are views – that is, they do not copy the underlying C data. This makes them cheap to construct, as internally they are simply a reference to the Doc object, a start position, an end position, and a label ID.
-
token = span[i]Get the Token object at position i, where i is an offset within the Span, not the document. That is:
-
leftsTokens that are to the left of the span, whose head is within the span, i.e.
- lefts = [span.doc[i] for i in range(0, span.start)
- if span.doc[i].head in span]
-
-
rightsTokens that are to the right of the span, whose head is within the span, i.e.
-
rights = [span.doc[i] for i in range(span.end, len(span.doc))
- if span.doc[i].head in span]
-
-
-
subtreeTokens in the range (start, end+1), where start is the index of the leftmost word descended from a token in the span, and end is the index of the rightmost token descended from a token in the span.
-
The Lexeme object represents a lexical type, stored in the vocabulary – as opposed to a token, occurring in a document.
-
Lexemes store various features, so that these features can be computed once per type, rather than once per token. As job sizes grow, this can amount to a substantial efficiency improvement.
-
All Lexeme attributes are therefore context independent, as a single lexeme is reused for all usages of that word. Lexemes are keyed by the “orth” attribute.
-
All Lexeme attributes are accessible directly on the Token object.
-
-
-
String Features
-
-
-
orth / orth_The form of the word with no string normalization or processing, as it appears in the string, without trailing whitespace.
-
-
lower / lower_The form of the word, but forced to lower-case, i.e. lower = word.orth_.lower()
-
-
shape / shape_A transform of the word's string, to show orthographic features. The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. After these mappings, sequences of 4 or more of the same character are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, :) --> :)
-
-
prefix / prefix_A length-N substring from the start of the word. Length may vary by language; currently for English n=1, i.e. prefix = word.orth_[:1]
-
-
suffix / suffix_A length-N substring from the end of the word. Length may vary by language; currently for English n=3, i.e. suffix = word.orth_[-3:]
-
-
-
-
-
-
Boolean Features
-
-
-
is_alpha Equivalent to word.orth_.isalpha()
-
-
is_ascii Equivalent to not any(ord(c) >= 128 for c in word.orth_)
-
-
is_digit Equivalent to word.orth_.isdigit()
-
-
is_lower Equivalent to word.orth_.islower()
-
-
is_title Equivalent to word.orth_.istitle()
-
-
is_punct Equivalent to word.orth_.ispunct()
-
-
is_space Equivalent to word.orth_.isspace()
-
-
like_url Does the word resemble a URL?
-
-
like_num Does the word represent a number? e.g. “10.9”, “10”, “ten”, etc
-
-
like_email Does the word resemble an email?
-
-
is_oov Is the word out-of-vocabulary?
-
-
-
-
-
-
Distributional Features
-
-
-
prob The unigram log-probability of the word, estimated from counts from a large corpus, smoothed using Simple Good Turing estimation.
-
-
cluster The Brown cluster ID of the word. These are often useful features for linear models. If you’re using a non-linear model, particularly a neural net or random forest, consider using the real-valued word representation vector, in Token.repvec, instead.
-
-
repvec A “word embedding” representation: a dense real-valued vector that supports similarity queries between words. By default, spaCy currently loads vectors produced by the Levy and Goldberg (2014) dependency-based word2vec model.
-
Intern strings, and map them to sequential integer IDs. The mapping table is very efficient, and a small-string optimization is used to maintain a small memory footprint. Only the integer IDs are held by spaCy's data classes (Doc, Token, Span and Lexeme) – when you use a string-valued attribute like token.orth_, you access a property that computes token.strings[token.orth].
-
-
string = string_store[int_id]Retrieve a string from a given integer ID. If the integer ID is not found, raise IndexError
-
-
int_id = string_store[unicode_string] Map a unicode string to an integer ID. If the string is previously unseen, it is interned, and a new ID is returned.
-
-
int_id = string_store[utf8_byte_string] Byte strings are assumed to be in UTF-8 encoding. Strings encoded with other codecs may fail silently. Given a utf8 string, the behaviour is the same as for unicode strings. Internally, strings are stored in UTF-8 format. So if you start with a UTF-8 byte string, it's less efficient to first decode it as unicode, as StringStore will then have to encode it as UTF-8 once again.
-
-
n_strings = len(string_store)Number of strings in the string-store
-
-
for string in string_storeIterate over strings in the string store, in order, such that the ith string in the sequence has the ID i:
-
for i, string in enumerate(string_store):
- assert i == string_store[string]
-
-
-
-
-
Constructors
-
-
StringStore.__init__ takes no arguments, so a new instance can be constructed as follows:
-
string_store = StringStore()
-
However, in practice you'll usually use the instance owned by the language's vocab object, which all classes hold a reference to:
-
-
english.vocab.strings
-
doc.vocab.strings
-
span.vocab.strings
-
token.vocab.strings
-
lexeme.vocab.strings
-
-
If you create another instance, it will map strings to different integers – which is usually not what you want.
Save the strings mapping to the given location, in plain text. The format is subject to change; so if you need to read/write compatible files, you can find details in the strings.pyx source.
Load the strings mapping from a plain-text file in the given location. The format is subject to change; so if you need to read/write compatible files, you can find details in the strings.pyx source.
-
-
-
\ No newline at end of file
diff --git a/website/src/jade/docs/_api.jade b/website/src/jade/docs/_api.jade
index ffcd39dd5..4db74cc9b 100644
--- a/website/src/jade/docs/_api.jade
+++ b/website/src/jade/docs/_api.jade
@@ -87,10 +87,9 @@ mixin SeeAlso(name, link_target)
mixin Define(term)
- li
- #[span.declaration #[code #{term}]]
+ li #[span.declaration #[code #{term}]]
block
-
+
mixin LexemeBooleans()
diff --git a/website/src/jade/docs/_spec.jade b/website/src/jade/docs/_spec.jade
index ba13ef995..08a31e810 100644
--- a/website/src/jade/docs/_spec.jade
+++ b/website/src/jade/docs/_spec.jade
@@ -11,8 +11,7 @@ mixin row(...cells)
mixin Define(term)
- li
- #[code #{term}]
+ li #[code #{term}]
block
diff --git a/website/src/jade/home/_installation.jade b/website/src/jade/home/_installation.jade
index b325c2a2f..2454fb9d3 100644
--- a/website/src/jade/home/_installation.jade
+++ b/website/src/jade/home/_installation.jade
@@ -7,10 +7,9 @@ mixin Option(name, open)
+Option("Updating your installation")
| To update your installation:
- pre.language-bash
- code
- $ pip install --upgrade spacy
- $ python -m spacy.en.download --force all
+ pre.language-bash: code
+ | $ pip install --upgrade spacy
+ | $ python -m spacy.en.download --force all
p Most updates ship a new model, so you will usually have to redownload the data.
From 8abe2a6918732a19855379e658798f8d47b3d313 Mon Sep 17 00:00:00 2001
From: Hugo
Date: Wed, 25 Nov 2015 08:58:05 +0200
Subject: [PATCH 08/13] Python 3.5 is out
---
.travis.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.travis.yml b/.travis.yml
index fc2441e3e..c90da84d2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,7 @@ os:
python:
- "2.7"
- "3.4"
+ - "3.5"
# install dependencies
install:
From f89665a0d09aeafdc98cdf5f5a3ad64099b725d4 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Thu, 26 Nov 2015 09:15:43 +0100
Subject: [PATCH 09/13] add missing tutorials
---
fabfile.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/fabfile.py b/fabfile.py
index 8af52f135..96d8eed01 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -75,15 +75,16 @@ def web():
jade('blog/index.jade', 'blog/')
jade('tutorials/index.jade', 'tutorials/')
- for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / 'blog').iterdir():
- if post_dir.is_dir() \
- and (post_dir / 'index.jade').exists() \
- and (post_dir / 'meta.jade').exists():
- jade(str(post_dir / 'index.jade'), path.join('blog', post_dir.parts[-1]))
+ for collection in ('blog', 'tutorials'):
+ for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / collection).iterdir():
+ if post_dir.is_dir() \
+ and (post_dir / 'index.jade').exists() \
+ and (post_dir / 'meta.jade').exists():
+ jade(str(post_dir / 'index.jade'), path.join(collection, post_dir.parts[-1]))
def web_publish(assets_path):
- local('aws s3 sync --delete website/site/ s3://spacy.io')
+ local('aws s3 sync --delete --exclude "resources/*" website/site/ s3://spacy.io')
local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
From ce1a1451c5fc83d2eb3cb979806be90cd7be1ef8 Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Thu, 26 Nov 2015 09:27:04 +0100
Subject: [PATCH 10/13] add 404 page, remove old license links
---
fabfile.py | 1 +
website/src/jade/blog/introducing-spacy/index.jade | 2 +-
website/src/jade/header.jade | 1 -
website/src/jade/tutorials/add-a-language/index.jade | 4 ++--
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/fabfile.py b/fabfile.py
index 96d8eed01..6ba1df406 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -70,6 +70,7 @@ def web():
with virtualenv(VENV_DIR):
local('./website/create_code_samples spacy/tests/website/ website/src/code/')
+ jade('404.jade', '')
jade('home/index.jade', '')
jade('docs/index.jade', 'docs/')
jade('blog/index.jade', 'blog/')
diff --git a/website/src/jade/blog/introducing-spacy/index.jade b/website/src/jade/blog/introducing-spacy/index.jade
index c9f8834ef..a7f45aab3 100644
--- a/website/src/jade/blog/introducing-spacy/index.jade
+++ b/website/src/jade/blog/introducing-spacy/index.jade
@@ -19,4 +19,4 @@ include ./meta.jade
+TweetThis("Computers don't understand text. This is unfortunate, because that's what the web is mostly made of.", Meta.url)
p If none of that made any sense to you, here's the gist of it. Computers don't understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it.
- p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can #[a(href="/license") buy a commercial license] under generous terms.
+ p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can buy a commercial license under generous terms.
diff --git a/website/src/jade/header.jade b/website/src/jade/header.jade
index e07ca01e0..f9da61a27 100644
--- a/website/src/jade/header.jade
+++ b/website/src/jade/header.jade
@@ -115,7 +115,6 @@ mixin WritePage(Site, Author, Page)
li(class={active: Page.active.home}): a(href="/") Home
li(class={active: Page.active.docs}): a(href="/docs") Docs
li: a(href="http://api.spacy.io/displacy", target="_blank") Demo
- //li(class={active: Page.active.license}): a(href="/license") License
li(class={active: Page.active.blog}): a(href="/blog") Blog
main#content
block
diff --git a/website/src/jade/tutorials/add-a-language/index.jade b/website/src/jade/tutorials/add-a-language/index.jade
index cf9b90027..9b574a5ac 100644
--- a/website/src/jade/tutorials/add-a-language/index.jade
+++ b/website/src/jade/tutorials/add-a-language/index.jade
@@ -117,8 +117,8 @@ include ./meta.jade
details: summary: h4 Create frequencies
pre.language-bash: code
- $ python bin/get_freqs.py
- $ python bin/gather_freqs.py
+ | $ python bin/get_freqs.py
+ | $ python bin/gather_freqs.py
details: summary: h4 Brown clusters
From 7bb85996ece0e60231735eeada024c11538547bc Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Thu, 26 Nov 2015 09:53:37 +0100
Subject: [PATCH 11/13] add mit license link
---
website/src/jade/blog/introducing-spacy/index.jade | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/src/jade/blog/introducing-spacy/index.jade b/website/src/jade/blog/introducing-spacy/index.jade
index a7f45aab3..3d972e56c 100644
--- a/website/src/jade/blog/introducing-spacy/index.jade
+++ b/website/src/jade/blog/introducing-spacy/index.jade
@@ -19,4 +19,4 @@ include ./meta.jade
+TweetThis("Computers don't understand text. This is unfortunate, because that's what the web is mostly made of.", Meta.url)
p If none of that made any sense to you, here's the gist of it. Computers don't understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it.
- p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can buy a commercial license under generous terms.
+ p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can buy a commercial license under generous terms (Note: #[a(href="/blog/spacy-now-mit/") spaCy is now licensed under MIT]).
From 7d7175935545885b4bdbb2dd2eb63907342dbe2c Mon Sep 17 00:00:00 2001
From: Hugo
Date: Thu, 26 Nov 2015 16:08:22 +0200
Subject: [PATCH 12/13] Add Python 3.5
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 38d2e5dc8..4fd29e8ef 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ Supports
* CPython 2.7
* CPython 3.4
+* CPython 3.5
* OSX
* Linux
* Cygwin
From 3c7928f6183bed3818451167a6e9162e852a019c Mon Sep 17 00:00:00 2001
From: Henning Peters
Date: Sun, 29 Nov 2015 10:08:26 +0100
Subject: [PATCH 13/13] fix broken link
---
website/src/jade/blog/how-spacy-works/index.jade | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/src/jade/blog/how-spacy-works/index.jade b/website/src/jade/blog/how-spacy-works/index.jade
index 0aafcefc5..0f6b05913 100644
--- a/website/src/jade/blog/how-spacy-works/index.jade
+++ b/website/src/jade/blog/how-spacy-works/index.jade
@@ -98,7 +98,7 @@ include ./meta.jade
h3 Dependency Parser
- p The parser uses the algorithm described in my #[a(href="parsing-english-in-python/") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
+ p The parser uses the algorithm described in my #[a(href="/blog/parsing-english-in-python/") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
p Some quick details about spaCy's take on this, for those who happen to know these models well. I'll write up a better description shortly.