Mirror of https://github.com/explosion/spaCy.git

Commit dca532af13 — get travis running

fabfile.py (vendored) — 35 changed lines; the remaining hunks touch the website's Jade templates.
@@ -74,7 +74,6 @@ def web():
     jade('home/index.jade', '')
     jade('docs/index.jade', 'docs/')
     jade('blog/index.jade', 'blog/')
-    jade('tutorials/index.jade', 'tutorials/')
 
     for collection in ('blog', 'tutorials'):
         for post_dir in (Path(__file__).parent / 'website' / 'src' / 'jade' / collection).iterdir():
@@ -85,7 +84,39 @@ def web():
 
 
 def web_publish(assets_path):
-    local('aws s3 sync --delete --exclude "resources/*" website/site/ s3://spacy.io')
+    from boto.s3.connection import S3Connection, OrdinaryCallingFormat
+
+    site_path = 'website/site'
+
+    os.environ['S3_USE_SIGV4'] = 'True'
+    conn = S3Connection(host='s3.eu-central-1.amazonaws.com',
+                        calling_format=OrdinaryCallingFormat())
+    bucket = conn.get_bucket('spacy.io', validate=False)
+
+    keys_left = set([k.name for k in bucket.list()
+                     if not k.name.startswith('resources')])
+
+    for root, dirnames, filenames in os.walk(site_path):
+        for filename in filenames:
+            source = os.path.join(root, filename)
+
+            target = os.path.relpath(root, site_path)
+            if target == '.':
+                target = filename
+            elif filename != 'index.html':
+                target = os.path.join(target, filename)
+
+            key = bucket.new_key(target)
+            key.set_metadata('Content-Type', 'text/html')
+            key.set_contents_from_filename(source)
+            print('uploading %s' % target)
+
+            keys_left.remove(target)
+
+    for key_name in keys_left:
+        print('deleting %s' % key_name)
+        bucket.delete_key(key_name)
+
     local('aws s3 sync --delete %s s3://spacy.io/resources' % assets_path)
 
 
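Why the upload moved off the aws CLI: the bucket lives in eu-central-1 (Frankfurt), which only accepts Signature Version 4 requests. boto 2.x needs two things for that: the S3_USE_SIGV4 environment flag and an explicit connection to the regional endpoint. A minimal sketch of the same connection setup, assuming boto 2.x and AWS credentials already present in the environment (the hello.html key is just an example, not part of the commit):

    import os

    from boto.s3.connection import S3Connection, OrdinaryCallingFormat

    # SigV4 is mandatory in eu-central-1; boto 2 enables it via this flag,
    # and SigV4 requires naming the region's own endpoint.
    os.environ['S3_USE_SIGV4'] = 'True'
    conn = S3Connection(host='s3.eu-central-1.amazonaws.com',
                        calling_format=OrdinaryCallingFormat())

    # validate=False skips the extra round-trip that merely checks
    # the bucket exists before any key operation.
    bucket = conn.get_bucket('spacy.io', validate=False)
    key = bucket.new_key('hello.html')
    key.set_metadata('Content-Type', 'text/html')
    key.set_contents_from_string('<h1>hello</h1>')

With Fabric 1.x, the task above would be invoked as fab web_publish:/path/to/assets.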
@@ -2,6 +2,7 @@ include ./header
 include ./mixins.jade
 
 - var Page = InitPage(Site, Authors.spacy, "home", '404')
+- Page.canonical_url = null
 - Page.is_error = true
 - Site.slogan = "404"
 - Page.active = {}
@@ -4,7 +4,7 @@ include ./meta.jade
 
 +WritePost(Meta)
   section.intro
-    p Natural Language Processing moves fast, so maintaining a good library means constantly throwing things away. Most libraries are failing badly at this, as academics hate to editorialize. This post explains the problem, why it's so damaging, and why I wrote #[a(href="http://spacy.io") spaCy] to do things differently.
+    p Natural Language Processing moves fast, so maintaining a good library means constantly throwing things away. Most libraries are failing badly at this, as academics hate to editorialize. This post explains the problem, why it's so damaging, and why I wrote #[a(href="https://spacy.io") spaCy] to do things differently.
 
   p Imagine: you try to use Google Translate, but it asks you to first select which model you want. The new, awesome deep-learning model is there, but so are lots of others. You pick one that sounds fancy, but it turns out it's a 20-year old experimental model trained on a corpus of oven manuals. When it performs little better than chance, you can't even tell from its output. Of course, Google Translate would not do this to you. But most Natural Language Processing libraries do, and it's terrible.
 
@@ -12,7 +12,7 @@ include ./meta.jade
 
   p Have a look through the #[a(href="http://gate.ac.uk/sale/tao/split.html") GATE software]. There's a lot there, developed over 12 years and many person-hours. But there's approximately zero curation. The philosophy is just to provide things. It's up to you to decide what to use.
 
-  p This is bad. It's bad to provide an implementation of #[a(href="https://gate.ac.uk/sale/tao/splitch18.html") MiniPar], and have it just...sit there, with no hint that it's 20 years old and should not be used. The RASP parser, too. Why are these provided? Worse, why is there no warning? The #[a(href="http://webdocs.cs.ualberta.ca/~lindek/minipar.htm") Minipar homepage] puts the software in the right context:
+  p This is bad. It's bad to provide an implementation of #[a(href="https://gate.ac.uk/sale/tao/splitch18.html") MiniPar], and have it just...sit there, with no hint that it's 20 years old and should not be used. The RASP parser, too. Why are these provided? Worse, why is there no warning? The #[a(href="https://web.archive.org/web/20150907234221/http://webdocs.cs.ualberta.ca/~lindek/minipar.htm") Minipar homepage] puts the software in the right context:
 
   blockquote
     p MINIPAR is a broad-coverage parser for the English language. An evaluation with the SUSANNE corpus shows that MINIPAR achieves about 88% precision and 80% recall with respect to dependency relationships. MINIPAR is very efficient, #[strong on a Pentium II 300 with 128MB memory], it parses about 300 words per second.
@@ -23,13 +23,13 @@ include ./meta.jade
 
   h3 Why I didn't contribute to NLTK
 
-  p Various people have asked me why I decided to make a new Python NLP library, #[a(href="http://spacy.io") spaCy], instead of supporting the #[a(href="http://nltk.org") NLTK] project. This is the main reason. You can't contribute to a project if you believe that the first thing that they should do is throw almost all of it away. You should just make your own project, which is what I did.
+  p Various people have asked me why I decided to make a new Python NLP library, #[a(href="https://spacy.io") spaCy], instead of supporting the #[a(href="http://nltk.org") NLTK] project. This is the main reason. You can't contribute to a project if you believe that the first thing that they should do is throw almost all of it away. You should just make your own project, which is what I did.
   p Have a look through #[a(href="http://www.nltk.org/py-modindex.html") the module list of NLTK]. It looks like there's a lot there, but there's not. What NLTK has is a decent tokenizer, some passable stemmers, a good implementation of the Punkt sentence boundary detector (after #[a(href="http://joelnothman.com/") Joel Nothman] rewrote it), some visualization tools, and some wrappers for other libraries. Nothing else is of any use.
 
   p For instance, consider #[code nltk.parse]. You might think that amongst all this code there was something that could actually predict the syntactic structure of a sentence for you, but you would be wrong. There are wrappers for the BLLIP and Stanford parsers, and since March there's been an implementation of Nivre's 2003 transition-based dependency parser. Unfortunately no model is provided for it, as they rely on an external wrapper of an external learner, which is unsuitable for the structure of their problem. So the implementation is too slow to be actually useable.
 
-  p This problem is totally avoidable, if you just sit down and write good code, instead of stitching together external dependencies. I pointed NLTK to my tutorial describing #[a(href="http://spacy.io/blog/parsing-english-in-python/") how to implement a modern dependency parser], which includes a BSD-licensed implementation in 500 lines of Python. I was told "thanks but no thanks", and #[a(href="https://github.com/nltk/nltk/issues/694") the issue was abruptly closed]. Another researcher's offer from 2012 to implement this type of model also went #[a(href="http://arxiv.org/pdf/1409.7386v1.pdf") unanswered].
+  p This problem is totally avoidable, if you just sit down and write good code, instead of stitching together external dependencies. I pointed NLTK to my tutorial describing #[a(href="https://spacy.io/blog/parsing-english-in-python") how to implement a modern dependency parser], which includes a BSD-licensed implementation in 500 lines of Python. I was told "thanks but no thanks", and #[a(href="https://github.com/nltk/nltk/issues/694") the issue was abruptly closed]. Another researcher's offer from 2012 to implement this type of model also went #[a(href="http://arxiv.org/pdf/1409.7386v1.pdf") unanswered].
 
-  p The story in #[code nltk.tag] is similar. There are plenty of wrappers, for the external libraries that have actual taggers. The only actual tagger model they distribute is #[a(href="http://spacy.io/blog/part-of-speech-POS-tagger-in-python/") terrible]. Now it seems that #[a(href="https://github.com/nltk/nltk/issues/1063") NLTK does not even know how its POS tagger was trained]. The model is just this .pickle file that's been passed around for 5 years, its origins lost to time. It's not okay to offer this to people, to recommend they use it.
+  p The story in #[code nltk.tag] is similar. There are plenty of wrappers, for the external libraries that have actual taggers. The only actual tagger model they distribute is #[a(href="https://spacy.io/blog/part-of-speech-POS-tagger-in-python") terrible]. Now it seems that #[a(href="https://github.com/nltk/nltk/issues/1063") NLTK does not even know how its POS tagger was trained]. The model is just this .pickle file that's been passed around for 5 years, its origins lost to time. It's not okay to offer this to people, to recommend they use it.
 
   p I think open source software should be very careful to make its limitations clear. It's a disservice to provide something that's much less useful than you imply. It's like offering your friend a lift and then not showing up. It's totally fine to not do something – so long as you never suggested you were going to do it. There are ways to do worse than nothing.
@@ -2,7 +2,7 @@ include ../../header.jade
 include ./meta.jade
 
 mixin Displacy(sentence, caption_text, height)
-  - var url = "https://api.spacy.io/displacy/?full=" + sentence.replace(" ", "%20")
+  - var url = "https://api.spacy.io/displacy/?full=" + sentence.replace(/\s+/g, "%20")
 
   .displacy
     iframe.displacy(src="/resources/displacy/robots.html" height=height)
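The replace() fix matters because JavaScript's String.prototype.replace substitutes only the first match when given a plain string, so multi-word sentences left raw spaces in the query string after the first word; the /\s+/g regex rewrites every whitespace run. A sketch of the same URL construction in Python, purely for illustration (the sentence value is an example, not part of the commit):

    import re
    from urllib.parse import quote

    sentence = "Robots in popular culture are there to remind us"

    # Equivalent of the fixed Jade line: collapse each whitespace run to %20.
    url = "https://api.spacy.io/displacy/?full=" + re.sub(r"\s+", "%20", sentence)

    # Sturdier alternative: percent-encode all reserved characters, not just spaces.
    url = "https://api.spacy.io/displacy/?full=" + quote(sentence)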
@@ -20,7 +20,7 @@ mixin Displacy(sentence, caption_text, height)
 
   p A syntactic dependency parse is a kind of shallow meaning representation. It's an important piece of many language understanding and text processing technologies. Now that these representations can be computed quickly, and with increasingly high accuracy, they're being used in lots of applications – translation, sentiment analysis, and summarization are major application areas.
 
-  p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="http://spaCy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="https://api.spacy.io/displacy") displaCy]:
+  p I've been living and breathing similar representations for most of my career. But there's always been a problem: talking about these things is tough. Most people haven't thought much about grammatical structure, and the idea of them is inherently abstract. When I left academia to write #[a(href="https://spacy.io") spaCy], I knew I wanted a good visualizer. Unfortunately, I also knew I'd never be the one to write it. I'm deeply graphically challenged. Fortunately, when working with #[a(href="http://ines.io") Ines] to build this site, she really nailed the problem, with a solution I'd never have thought of. I really love the result, which we're calling #[a(href="https://api.spacy.io/displacy") displaCy]:
 
 +Displacy("Robots in popular culture are there to remind us of the awesomeness of unbounded human agency", "Click the button to full-screen and interact, or scroll to see the full parse.", 325)
 
@@ -9,4 +9,4 @@
 - Meta.links[0].name = 'Reddit'
 - Meta.links[0].title = 'Discuss on Reddit'
 - Meta.links[0].url = "https://www.reddit.com/r/programming/comments/3hoj0b/displaying_linguistic_structure_with_css/"
-- Meta.image = "http://spacy.io/resources/img/displacy_screenshot.jpg"
+- Meta.image = "https://spacy.io/resources/img/displacy_screenshot.jpg"
@@ -3,7 +3,7 @@
 - Meta.headline = "Statistical NLP in Basic English"
 - Meta.description = "When I was little, my favorite TV shows all had talking computers. Now I’m big and there are still no talking computers, so I’m trying to make some myself. Well, we can make computers say things. But when we say things back, they don’t really understand. Why not?"
 - Meta.date = "2015-08-24"
-- Meta.url = "/blog/eli5-computers-learn-reading/"
+- Meta.url = "/blog/eli5-computers-learn-reading"
 - Meta.links = []
 //- Meta.links[0].id = 'reddit'
 //- Meta.links[0].name = "Reddit"
@@ -92,13 +92,13 @@ include ./meta.jade
 
   h3 Part-of-speech Tagger
 
-  p In 2013, I wrote a blog post describing #[a(href="/blog/part-of-speech-POS-tagger-in-python/") how to write a good part of speech tagger]. My recommendation then was to use greedy decoding with the averaged perceptron. I think this is still the best approach, so it's what I implemented in spaCy.
+  p In 2013, I wrote a blog post describing #[a(href="/blog/part-of-speech-POS-tagger-in-python") how to write a good part of speech tagger]. My recommendation then was to use greedy decoding with the averaged perceptron. I think this is still the best approach, so it's what I implemented in spaCy.
 
   p The tutorial also recommends the use of Brown cluster features, and case normalization features, as these make the model more robust and domain independent. spaCy's tagger makes heavy use of these features.
 
   h3 Dependency Parser
 
-  p The parser uses the algorithm described in my #[a(href="/blog/parsing-english-in-python/") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
+  p The parser uses the algorithm described in my #[a(href="/blog/parsing-english-in-python") 2014 blog post]. This algorithm, shift-reduce dependency parsing, is becoming widely adopted due to its compelling speed/accuracy trade-off.
 
   p Some quick details about spaCy's take on this, for those who happen to know these models well. I'll write up a better description shortly.
 
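For readers skimming the diff: "greedy decoding with the averaged perceptron" means tagging left to right, committing to each tag as you go, and training with perceptron updates whose weights are averaged over all update steps. A compressed sketch of the idea in plain Python (hypothetical feature names; spaCy's actual implementation is Cython and far more heavily engineered):

    from collections import defaultdict

    class AveragedPerceptron(object):
        """Sketch of an averaged perceptron for greedy tagging."""
        def __init__(self):
            self.weights = defaultdict(lambda: defaultdict(float))
            self._totals = defaultdict(float)  # weight mass accumulated over time
            self._stamps = defaultdict(int)    # step at which a weight last changed
            self.i = 0

        def predict(self, features, classes):
            scores = defaultdict(float)
            for feat in features:
                for clas, weight in self.weights[feat].items():
                    scores[clas] += weight
            # Tie-break on the class name so decoding is deterministic.
            return max(classes, key=lambda c: (scores[c], c))

        def update(self, truth, guess, features):
            self.i += 1
            if truth == guess:
                return
            for feat in features:
                for clas, delta in ((truth, 1.0), (guess, -1.0)):
                    key = (feat, clas)
                    # Lazy accumulation keeps averaging O(updates), not O(steps).
                    self._totals[key] += (self.i - self._stamps[key]) * self.weights[feat][clas]
                    self._stamps[key] = self.i
                    self.weights[feat][clas] += delta

        def average(self):
            for feat, clas_weights in self.weights.items():
                for clas in clas_weights:
                    key = (feat, clas)
                    total = self._totals[key] + (self.i - self._stamps[key]) * clas_weights[clas]
                    clas_weights[clas] = total / self.i if self.i else 0.0

    def tag(words, model, classes):
        # Greedy decoding: commit to each tag and feed it to the next token's features.
        tags = []
        for i, word in enumerate(words):
            prev = tags[i - 1] if i else '-START-'
            feats = ['word=' + word.lower(), 'prev=' + prev, 'suffix=' + word[-3:]]
            tags.append(model.predict(feats, classes))
        return tags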
@@ -33,7 +33,7 @@ include ../header.jade
 
 +WritePage(Site, Authors.spacy, Page)
   section.intro.profile
-    p A lot of work has gone into #[strong spaCy], but no magic. We plan to keep no secrets. We want you to be able to #[a(href="/blog/spacy-now-mit") build your business] on #[strong spaCy] – so we want you to understand it. Tell us whether you do. #[span.social #[a(href="//twitter.com/" + Site.twitter, target="_blank") Twitter] #[a(href="mailto:contact@spacy.io") Contact us]]
+    p A lot of work has gone into #[strong spaCy], but no magic. We plan to keep no secrets. We want you to be able to #[a(href="/blog/spacy-now-mit") build your business] on #[strong spaCy] – so we want you to understand it. Tell us whether you do. #[span.social #[a(href="https://twitter.com/" + Site.twitter, target="_blank") Twitter] #[a(href="mailto:contact@spacy.io") Contact us]]
   nav(role='navigation')
     ul
       li #[a.button(href='#blogs') Blog]
@@ -19,4 +19,4 @@ include ./meta.jade
 +TweetThis("Computers don't understand text. This is unfortunate, because that's what the web is mostly made of.", Meta.url)
 
   p If none of that made any sense to you, here's the gist of it. Computers don't understand text. This is unfortunate, because that's what the web almost entirely consists of. We want to recommend people text based on other text they liked. We want to shorten text to display it on a mobile screen. We want to aggregate it, link it, filter it, categorise it, generate it and correct it.
-  p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can buy a commercial license under generous terms (Note: #[a(href="/blog/spacy-now-mit/") spaCy is now licensed under MIT]).
+  p spaCy provides a library of utility functions that help programmers build such products. It's commercial open source software: you can either use it under the AGPL, or you can buy a commercial license under generous terms (Note: #[a(href="/blog/spacy-now-mit") spaCy is now licensed under MIT]).
@@ -5,7 +5,7 @@ include ../../header.jade
 +WritePost(Meta)
   //# AGPL not free enough: spaCy now under MIT, offering adaptation as a service
 
-  p Three big announcements for #[a(href="http://spacy.io") spaCy], a Python library for industrial-strength natural language processing (NLP).
+  p Three big announcements for #[a(href="https://spacy.io") spaCy], a Python library for industrial-strength natural language processing (NLP).
 
   ol
     li The founding team is doubling in size: I'd like to welcome my new co-founder, #[a(href="https://www.linkedin.com/profile/view?id=ADEAAADkZcYBnipeHOAS6HqrDBPK1IzAAVI64ds&authType=NAME_SEARCH&authToken=YYZ1&locale=en_US&srchid=3310922891443387747239&srchindex=1&srchtotal=16&trk=vsrp_people_res_name&trkInfo=VSRPsearchId%3A3310922891443387747239%2CVSRPtargetId%3A14968262%2CVSRPcmpt%3Aprimary%2CVSRPnm%3Atrue%2CauthType%3ANAME_SEARCH") Henning Peters].
@@ -52,7 +52,7 @@ details
 
   p The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set. We also map the tags to the simpler Google Universal POS Tag set.
 
-  p Details #[a(href="https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124") here].
+  p Details #[a(href="https://github.com/honnibal/spaCy/blob/master/spacy/tagger.pyx") here].
 
 details
   summary: h4 Lemmatization
@@ -2,20 +2,20 @@
 - Site.name = "spaCy.io"
 - Site.slogan = "Build Tomorrow's Language Technologies"
 - Site.description = "spaCy is a library for industrial-strength text processing in Python. If you're a small company doing NLP, we want spaCy to seem like a minor miracle."
-- Site.image = "http://spacy.io/resources/img/social.png"
-- Site.image_small = "http://spacy.io/resources/img/social_small.png"
+- Site.image = "https://spacy.io/resources/img/social.png"
+- Site.image_small = "https://spacy.io/resources/img/social_small.png"
 - Site.twitter = "spacy_io"
-- Site.url = "http://spacy.io"
+- Site.url = "https://spacy.io"
 -
 - Authors = {"matt": {}, "spacy": {}};
 - Authors.matt.name = "Matthew Honnibal"
-- Authors.matt.bio = "Matthew Honnibal is the author of the <a href=\"http://spacy.io\">spaCy</a> software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin."
+- Authors.matt.bio = "Matthew Honnibal is the author of the <a href=\"https://spacy.io\">spaCy</a> software and the sole founder of its parent company. He studied linguistics as an undergrad, and never thought he'd be a programmer. By 2009 he had a PhD in computer science, and in 2014 he left academia to found Syllogism Co. He's from Sydney and lives in Berlin."
 
 - Authors.matt.image = "/resources/img/matt.png"
 - Authors.matt.twitter = "honnibal"
 -
 - Authors.spacy.name = "SpaCy.io"
-- Authors.spacy.bio = "<a href=\"http://spacy.io\">spaCy</a> is a library for industrial-strength natural language processing in Python and Cython. It features state-of-the-art speed and accuracy, a concise API, and great documentation. If you're a small company doing NLP, we want spaCy to seem like a minor miracle."
+- Authors.spacy.bio = "<a href=\"https://spacy.io\">spaCy</a> is a library for industrial-strength natural language processing in Python and Cython. It features state-of-the-art speed and accuracy, a concise API, and great documentation. If you're a small company doing NLP, we want spaCy to seem like a minor miracle."
 - Authors.spacy.image = "/resources/img/social_small.png"
 - Authors.spacy.twitter = "spacy_io"
 
@@ -27,10 +27,11 @@
 - Page.active[type] = true;
 - Page.links = [];
 - if (type == "home") {
-- Page.url = "";
+- Page.url = "/";
 - } else {
 - Page.url = "/" + type;
 - }
+- Page.canonical_url = Site.url + Page.url;
 -
 - // Set defaults
 - Page.description = Site.description;
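The home page's URL changes from "" to "/" presumably so the new canonical URL comes out as https://spacy.io/ rather than the path-less https://spacy.io. A hypothetical Python mirror of the Jade logic, just to make the two cases concrete:

    SITE_URL = "https://spacy.io"

    def canonical_url(page_type):
        # Mirrors the Jade branch: home gets "/", everything else "/" + type.
        page_url = "/" if page_type == "home" else "/" + page_type
        return SITE_URL + page_url

    assert canonical_url("home") == "https://spacy.io/"
    assert canonical_url("docs") == "https://spacy.io/docs"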
@@ -57,6 +58,7 @@
 - Page.description = Meta.description
 - Page.date = Meta.date
 - Page.url = Meta.url
+- Page.canonical_url = Site.url + Page.url;
 - Page.active["blog"] = true
 - Page.links = Meta.links
 - if (Meta.image != null) {
@@ -98,6 +100,8 @@ mixin WritePage(Site, Author, Page)
     meta(property="og:site_name" content=Site.name)
     meta(property="article:published_time" content=getDate(Page.date).timestamp)
     link(rel="stylesheet" href="/resources/css/style.css")
+    if Page.canonical_url
+      link(rel="canonical" href=Page.canonical_url)
 
     //[if lt IE 9]><script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script><![endif]
 
@@ -158,10 +162,10 @@ mixin WritePost(Meta)
   +WriteAuthorBio(Author)
 
 mixin WriteByline(Author, Meta)
-  .subhead by #[a(href="//twitter.com/" + Author.twitter, rel="author" target="_blank") #{Author.name}] on #[time #{getDate(Meta.date).fulldate}]
+  .subhead by #[a(href="https://twitter.com/" + Author.twitter, rel="author" target="_blank") #{Author.name}] on #[time #{getDate(Meta.date).fulldate}]
 
 mixin WriteShareLinks(headline, url, twitter, links)
-  a.button.button-twitter(href="http://twitter.com/share?text=" + headline + "&url=" + Site.url + url + "&via=" + twitter title="Share on Twitter" target="_blank")
+  a.button.button-twitter(href="https://twitter.com/share?text=" + headline.replace(/\s+/g, "%20") + "&url=" + Site.url + url + "&via=" + twitter title="Share on Twitter" target="_blank")
     | Share on Twitter
   if links
     .discuss
@@ -174,11 +178,11 @@ mixin WriteShareLinks(headline, url, twitter, links)
       | Discuss on #{link.name}
 
 mixin TweetThis(text, url)
-  p #[span #{text} #[a.share(href='http://twitter.com/share?text="' + text + '"&url=' + Site.url + url + '&via=' + Site.twitter title='Share on Twitter' target='_blank') Tweet]]
+  p #[span #{text} #[a.share(href='https://twitter.com/share?text="' + text.replace(/\s+/g, "%20") + '"&url=' + Site.url + url + '&via=' + Site.twitter title='Share on Twitter' target='_blank') Tweet]]
 
 mixin WriteAuthorBio(Author)
   section.intro.profile
-    p #[img(src=Author.image)] !{Author.bio} #[span.social #[a(href="//twitter.com/" + Author.twitter target="_blank") Twitter]]
+    p #[img(src=Author.image alt=Author.name)] !{Author.bio} #[span.social #[a(href="https://twitter.com/" + Author.twitter target="_blank") Twitter]]
 
 
 - var getDate = function(input) {
@@ -1,5 +1,5 @@
 mixin Displacy(sentence, caption_text, height)
-  - var url = "https://api.spacy.io/displacy/?full=" + sentence.replace(" ", "%20")
+  - var url = "https://api.spacy.io/displacy/?full=" + sentence.replace(/\s+/g, "%20")
 
   .displacy
     iframe.displacy(src="/resources/displacy/displacy_demo.html" height=height)
@@ -58,4 +58,4 @@ mixin example(name)
   ul
     li: a(href="/docs#api") API documentation
     li: a(href="/docs#tutorials") Tutorials
-    li: a(href="/docs/#spec") Annotation specs
+    li: a(href="/docs#spec") Annotation specs
@@ -1,6 +1,6 @@
 - var Meta = {}
 - Meta.author_id = 'spacy'
-- Meta.headline = "Tutorial: Adding a language to spaCy"
+- Meta.headline = "Adding a language to spaCy"
 - Meta.description = "Long awaited documentation for adding a language to spaCy"
 - Meta.date = "2015-08-18"
 - Meta.url = "/tutorials/add-a-language"
@@ -1,5 +1,5 @@
 - var Meta = {}
-- Meta.headline = "Tutorial: Load new word vectors"
+- Meta.headline = "Load new word vectors"
 - Meta.description = "Word vectors allow simple similarity queries, and drive many NLP applications. This tutorial explains how to load custom word vectors into spaCy, to make use of task or data-specific representations."
 - Meta.author_id = "matt"
 - Meta.date = "2015-09-24"
@@ -1,5 +1,5 @@
 - var Meta = {}
-- Meta.headline = "Tutorial: Mark all adverbs, particularly for verbs of speech"
+- Meta.headline = "Mark all adverbs, particularly for verbs of speech"
 - Meta.author_id = 'matt'
 - Meta.description = "Let's say you're developing a proofreading tool, or possibly an IDE for writers. You're convinced by Stephen King's advice that adverbs are not your friend so you want to highlight all adverbs."
 - Meta.date = "2015-08-18"
@@ -1,5 +1,5 @@
 - var Meta = {}
-- Meta.headline = "Tutorial: Search Reddit for comments about Google doing something"
+- Meta.headline = "Search Reddit for comments about Google doing something"
 - Meta.description = "Example use of the spaCy NLP tools for data exploration. Here we will look for Reddit comments that describe Google doing something, i.e. discuss the company's actions. This is difficult, because other senses of \"Google\" now dominate usage of the word in conversation, particularly references to using Google products."
 - Meta.author_id = "matt"
 - Meta.date = "2015-08-18"
@@ -4,7 +4,7 @@ include ./meta.jade
 
 +WritePost(Meta)
   section.intro
-    p #[a(href="http://spaCy.io") spaCy] is great for data exploration. Poking, prodding and sifting is fundamental to good data science. In this tutorial, we'll do a broad keword search of Twitter, and then sift through the live stream of tweets, zooming in on some topics and excluding others.
+    p #[a(href="https://spacy.io") spaCy] is great for data exploration. Poking, prodding and sifting is fundamental to good data science. In this tutorial, we'll do a broad keword search of Twitter, and then sift through the live stream of tweets, zooming in on some topics and excluding others.
 
   p An example filter-function:
 
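The example filter-function itself sits outside this hunk. For orientation only, a hypothetical sketch of the kind of tweet filter the tutorial describes, written against the modern spaCy API rather than the 0.x API this 2015 post used (the model name and word lists are invented for illustration):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # assumes this model is installed

    INCLUDE = {"nlp", "parser", "tagger"}
    EXCLUDE = {"giveaway", "promo"}

    def is_relevant(tweet_text):
        # Compare lemmas, so "parsers" still matches "parser".
        doc = nlp(tweet_text.lower())
        lemmas = {token.lemma_ for token in doc if token.is_alpha}
        return bool(lemmas & INCLUDE) and not (lemmas & EXCLUDE)

    print(is_relevant("Our new parser handles tweets in real time"))  # True
    print(is_relevant("RT to enter the giveaway!"))                   # False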
@@ -1,5 +1,5 @@
 - var Meta = {}
-- Meta.headline = "Tutorial: Finding Relevant Tweets"
+- Meta.headline = "Finding Relevant Tweets"
 - Meta.author_id = 'matt'
 - Meta.description = "In this tutorial, we will use word vectors to search for tweets about Jeb Bush. We'll do this by building up two word lists: one that represents the type of meanings in the Jeb Bush tweets, and another to help screen out irrelevant tweets that mention the common, ambiguous word 'bush'."
 - Meta.date = "2015-08-18"