mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
162 lines
9.3 KiB
Plaintext
//- ----------------------------------
|
|
//- 💫 LANDING PAGE
|
|
//- ----------------------------------
|
|
|
|
include _includes/_mixins
|
|
|
|
header.o-header.u-pattern
|
|
h1.o-block-small.u-heading-0.u-text-shadow
|
|
| Industrial-strength#[br]
|
|
| Natural Language#[br]
|
|
| Processing
|
|
|
|
.o-block-small.u-text-medium.u-text-strong.u-text-shadow
|
|
| Thousands of researchers are trying to make#[br]
|
|
| computers understand text. They're succeeding.#[br]
|
|
| spaCy is a Python NLP library that helps you get#[br]
|
|
| their work out of papers and into production.
|
|
|
|
.o-inline-list
|
|
+button("/docs/#getting-started", true, "secondary")(target="_self") Install spaCy
|
|
|
|
main.o-main
|
|
.o-inline-list.u-padding-small.u-text-center.u-text-label.u-color-subtle.u-border-bottom
|
|
+a("https://github.com/" + SOCIAL.github + "/spaCy/releases") #[strong Latest Release:] v#{SPACY_VERSION}
|
|
|
|
+a("https://github.com/" + SOCIAL.github + "/spaCy") #[+icon("github")] #[strong #{SPACY_STARS}+ stars] on GitHub
|
|
|
|
+a("https://www.reddit.com/r/" + SOCIAL.reddit) #[+icon("reddit")] #[strong User Group] on Reddit
|
|
|
|
+grid.u-border-bottom
|
|
+grid-col("half").u-padding
|
|
+label Are you using spaCy?
|
|
+h(2)
|
|
+a("https://survey.spacy.io", true) Take the spaCy user survey
|
|
|
|
p.u-text-medium.
|
|
Two years after I started working on spaCy full time, I'm finally pushing forward with a #[strong 1.0 release]. It's also past time to take a bit of a census. I hope you'll take a few minutes to fill out this survey, to help me understand how you're using the library, and how it can be better.#[br]#[br]
|
|
|
|
#[strong Thanks for your support!]#[br]
|
|
#[strong.u-heading-3.u-color-theme Matt]
|
|
#[br]#[br]
|
|
|
|
#[+button("https://survey.spacy.io", true, "primary") Take the survey]
|
|
|
|
+grid-col("half").u-padding
|
|
+label The blog posts have moved
|
|
+h(2) Check out the new blog
|
|
|
|
p.u-text-medium We've updated the site to make it more focussed on the library itself. This will help us stay organised when we expand the tutorials section — by far the clearest message we've gotten from the survey so far. The blog posts have been moved to the new site for our consulting services, #[+a("https://explosion.ai", true) Explosion AI]. We've also updated our demos, and have open-sourced the services behind them. There are lots more releases to come. #[br]#[br]
|
|
|
|
+button("https://explosion.ai/blog", true, "primary") Go to the new blog
|
|
|
|
+grid
|
|
+grid-col("half").u-padding
|
|
+h(2) Built for Production
|
|
p.u-text-medium Most AI software is built for research. Over the last ten years, we've used a lot of that software, and built some of it ourselves, especially for natural language processing (NLP). But the faster the research has moved, the more impatient we've become. We want to see advanced NLP technologies get out into great products, as the basis of great businesses. We built spaCy to make that happen.
|
|
|
|
+h(2) Easy and Powerful
|
|
p.u-text-medium For any NLP task, there are always lots of competing algorithms. We don't believe in implementing them all and letting you choose. Instead, we just implement one – the best one. When better algorithms are developed, we can update the library without breaking your code or bloating the API. This approach makes spaCy both #[strong easier] and #[strong more powerful] than a pluggable architecture. spaCy also features a #[strong unique whole-document design]. Where other NLP libraries rely on sentence detection as a pre-process, spaCy reads the whole document at once, making it much more robust to informal and poorly formatted text.
|
|
|
|
+h(2) Permissive open-source license (MIT)
|
|
p.u-text-medium We think spaCy is valuable software, so we made it free, to raise its value even higher. Making spaCy open-source puts us on the same side – we can tell you everything about how it works, and let you run it however you like. We think the software would be much less valuable as a service, which could disappear at any point.
|
|
|
|
+grid-col("half").u-padding-medium
|
|
.x-terminal
|
|
.x-terminal__icons: span
|
|
.u-padding-small.u-text-label.u-text-strong.u-text-center lightning_tour.py
|
|
|
|
+code.x-terminal__code.
|
|
# pip install spacy && python -m spacy.en.download
|
|
import spacy
|
|
|
|
# Load English tokenizer, tagger, parser, NER and word vectors
|
|
nlp = spacy.load('en')
|
|
# Process a document, of any size
|
|
text = open('war_and_peace.txt').read()
|
|
doc = nlp(text)
|
|
|
|
from spacy.attrs import *
|
|
# All strings mapped to integers, for easy export to numpy
|
|
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
|
|
|
from reddit_corpus import RedditComments
|
|
reddit = RedditComments('/path/to/reddit/corpus')
|
|
# Parse a stream of documents, with multi-threading (no GIL!)
|
|
# Processes over 100,000 tokens per second.
|
|
for doc in nlp.pipe(reddit.texts, batch_size=10000, n_threads=4):
|
|
    # Multi-word expressions, such as names, dates etc
|
|
    # can be merged into single tokens
|
|
    for ent in doc.ents:
|
|
        ent.merge(ent.root.tag_, ent.text, ent.ent_type_)
|
|
    # Efficient, lossless serialization --- all annotations
|
|
    # saved, same size as uncompressed text
|
|
    byte_string = doc.to_bytes()
|
|
|
|
.o-block.u-text-center
|
|
+label spaCy is trusted by
|
|
|
|
.o-block
|
|
each row in logos
|
|
.o-inline-list.u-padding-medium
|
|
each logo in row
|
|
+a(logo[1])
|
|
img(src="/assets/img/logos/" + logo[0] + ".png" width="150").u-padding-medium
|
|
|
|
.o-block.u-text-center.u-padding
|
|
+label What our users say...
|
|
|
|
+grid
|
|
+grid-col("third")
|
|
<blockquote class="twitter-tweet" data-cards="hidden" data-lang="en"><p lang="en" dir="ltr">"Dead Code Should be Buried" <a href="http://t.co/AxfZRRz8nB">http://t.co/AxfZRRz8nB</a> by <a href="https://twitter.com/honnibal">@honnibal</a> on NLP tools &amp; new Python library spaCy <a href="http://t.co/C9f798R3aO">http://t.co/C9f798R3aO</a> looks nice!</p>— Andrej Karpathy (@karpathy) <a href="https://twitter.com/karpathy/status/640098689894232064">September 5, 2015</a></blockquote>
|
|
|
|
+grid-col("third")
|
|
<blockquote class="twitter-tweet" data-cards="hidden" data-lang="en"><p lang="en" dir="ltr">spaCy seems pretty exciting to me - and it is clear that NLTK has not kept up with <a href="https://twitter.com/hashtag/NLP?src=hash">#NLP</a>. <a href="http://t.co/mUPFUMLrbo">http://t.co/mUPFUMLrbo</a> <a href="https://twitter.com/hashtag/python?src=hash">#python</a> <a href="https://twitter.com/hashtag/datascience?src=hash">#datascience</a></p>— Alex Engler (@AlexCEngler) <a href="https://twitter.com/AlexCEngler/status/648537133544833025">September 28, 2015</a></blockquote>
|
|
|
|
+grid-col("third")
|
|
<blockquote class="twitter-tweet" data-lang="en"><p lang="en" dir="ltr">I wish I'd known about nlp.pipe about two weeks ago. Nice feature in <a href="https://twitter.com/Spacy">@spacy</a> to parallelize your NLP pipeline.</p>— Matti Lyra (@mattilyra) <a href="https://twitter.com/mattilyra/status/704753660329369600">March 1, 2016</a></blockquote>
|
|
|
|
+grid
|
|
+grid-col("half").u-padding
|
|
+label Benchmarks
|
|
+h(2) State-of-the-art speed and accuracy
|
|
|
|
p.u-text-medium spaCy is committed to rigorous evaluation under standard methodology. Two peer-reviewed papers in 2015 confirm that it offers the #[strong fastest syntactic parser in the world] and that #[strong its accuracy is within 2% of the best] available.
|
|
|
|
sup #[+a("http://aclweb.org/anthology/P/P15/P15-1038.pdf") [1]], #[+a("https://aclweb.org/anthology/D/D15/D15-1162.pdf") [2]], #[+a("http://homes.cs.washington.edu/~lsz/papers/llz-naacl16.pdf") [3]]
|
|
|
|
+grid-col("half").u-padding
|
|
+table(["System", "Language", "Accuracy", "Speed (WPS)"])
|
|
+row
|
|
+cell
|
|
!=partial("_includes/_logo", { logo_size: "tiny" })
|
|
+cell #[strong Cython]
|
|
+cell #[strong 91.8]
|
|
+cell #[strong 13,963]
|
|
|
|
+row
|
|
+cell ClearNLP
|
|
+cell Java
|
|
+cell 91.7
|
|
+cell 10,271
|
|
|
|
+row
|
|
+cell CoreNLP
|
|
+cell Java
|
|
+cell 89.6
|
|
+cell 8,602
|
|
|
|
+row
|
|
+cell MATE
|
|
+cell Java
|
|
+cell 92.5
|
|
+cell 550
|
|
|
|
+row
|
|
+cell Turbo
|
|
+cell C++
|
|
+cell 92.4
|
|
+cell 349
|
|
|
|
include _includes/_newsletter
|